
Appendix: Background

1 O notation

Throughout this chapter and the rest of the book, we will describe the asymptotic behavior of a function using $O$ notation.

For two functions $f(t)$ and $g(t)$, we say that $f(t) \le O(g(t))$ if $f$ is asymptotically upper bounded by $g$. Formally, this means that there exists some constant $C > 0$ such that $f(t) \le C \cdot g(t)$ for all $t$ past some point $t_0$.

We say $f(t) < o(g(t))$ if asymptotically $f$ grows strictly slower than $g$. Formally, this means that for *any* scalar $C > 0$, there exists some $t_0$ such that $f(t) \le C \cdot g(t)$ for all $t > t_0$. Equivalently, we say $f(t) < o(g(t))$ if $\lim_{t \to \infty} f(t)/g(t) = 0$.

$f(t) = \Theta(g(t))$ means that $f$ and $g$ grow at the same rate asymptotically. That is, $f(t) \le O(g(t))$ and $g(t) \le O(f(t))$.

Finally, we use $f(t) \ge \Omega(g(t))$ to mean that $g(t) \le O(f(t))$, and $f(t) > \omega(g(t))$ to mean that $g(t) < o(f(t))$.

We also use the notation $\tilde O(g(t))$ to hide logarithmic factors. That is, $f(t) = \tilde O(g(t))$ if there exists some constant $C$ such that $f(t) \le C \cdot g(t) \cdot \log^k(t)$ for some $k$ and all $t$.

Occasionally, we will also use $O(f(t))$ (or one of the other symbols) as shorthand to manipulate function classes. For example, we might write $O(f(t)) + O(g(t)) = O(f(t) + g(t))$ to mean that the sum of two functions in $O(f(t))$ and $O(g(t))$ is in $O(f(t) + g(t))$.
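For example, $f(t) = 3t^2 + 10t$ satisfies $f(t) \le O(t^2)$ (take $C = 13$ and $t_0 = 1$), while $t \log t < o(t^2)$ since $\lim_{t \to \infty} (t \log t)/t^2 = 0$; combining these via the shorthand above, $3t^2 + 10t + t \log t = O(t^2)$.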

2 Python


3 Multi-Armed Bandits

3.1 Introduction

The multi-armed bandits (MAB) setting is a simple setting for studying the basic challenges of sequential decision-making. In this setting, an agent repeatedly chooses from a fixed set of actions, called arms, each of which has an associated reward distribution. The agent’s goal is to maximize the total reward it receives over some time period.

In particular, we’ll spend a lot of time discussing the Exploration-Exploitation Tradeoff: should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good?

In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs.

from jaxtyping import Float, Array
import numpy as np
import latexify
import matplotlib.pyplot as plt  # used by plot_strategy below (import reconstructed)
from typing import Callable, Union

import solutions  # course-provided reference implementations (import reconstructed)

# (setup elided in the source; the keyword arguments below configure the
# `latex` decorator used later, assumed here to come from latexify)
latex = latexify.algorithmic(
    identifiers={"arm": "a_t", "reward": "r", "means": "mu"},
    use_math_symbols=True,
    escape_underscores=False,
)

Let $K$ denote the number of arms. We’ll label them $0, \dots, K-1$ and use superscripts to indicate the arm index; since we seldom need to raise a number to a power, this won’t cause much confusion. In this chapter, we’ll consider the Bernoulli bandit setting from the examples above, where arm $k$ either returns reward 1 with probability $\mu^k$ or 0 otherwise. The agent gets to pull an arm $T$ times in total. We can formalize the Bernoulli bandit in the following Python code:

class MAB:
     """
     The Bernoulli multi-armed bandit environment.
 
@@ -58,8 +58,8 @@
    def pull(self, k: int) -> int:
        """Pull the `k`-th arm and sample from its (Bernoulli) reward distribution."""
        reward = np.random.rand() < self.means[k].item()
        return +reward

mab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100)

In pseudocode, the agent’s interaction with the MAB environment can be described by the following process:

@latex
def mab_loop(mab: MAB, agent: "Agent") -> int:
    for t in range(mab.T):
        arm = agent.choose_arm()  # in 0, ..., K-1
        reward = mab.pull(arm)  # observe the sampled reward (line reconstructed from context)
        agent.update_history(arm, reward)
 
 
mab_loop

The Agent class stores the pull history and uses it to decide which arm to pull next. Since we are working with Bernoulli bandits, we can summarize the pull history concisely in an $\mathbb{N}^{K \times 2}$ array.

class Agent:
    def __init__(self, K: int, T: int):
        """The MAB agent that decides how to choose an arm given the past history."""
        self.K = K
        self.T = T
        self.rewards = []  # reward observed at each timestep
        self.choices = []  # arm chosen at each timestep
        self.history = np.zeros((K, 2), dtype=int)  # history[k, r]: number of pulls of arm k that returned reward r
        # (fields reconstructed from their usage in update_history below)

    def choose_arm(self) -> int:
        """Choose an arm to pull; implemented by subclasses."""
        raise NotImplementedError

    def update_history(self, arm: int, reward: int):
        self.rewards.append(reward)
        self.choices.append(arm)
        self.history[arm, reward] += 1

What’s the optimal strategy for the agent, i.e. the one that achieves the highest expected reward? Convince yourself that the agent should try to always pull the arm with the highest expected reward:

$$\mu^\star := \max_{k \in [K]} \mu^k.$$

The goal, then, can be rephrased as to minimize the regret, defined below:

def regret_per_step(mab: MAB, agent: Agent):
    """Get the difference from the average reward of the optimal arm. The sum of these is the regret."""
    return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices]

Note that this depends on the true means of the pulled arms, not the actual observed rewards. We typically think of this as a random variable where the randomness comes from the agent’s strategy (i.e. the sequence of actions it chooses). We can then bound the regret of different algorithms in two different senses:

  1. Upper bound the expected regret, i.e. show $\E[\text{Regret}_T] \le M_T$.

  2. Find a high-probability upper bound on the regret, i.e. show $\pr(\text{Regret}_T \le M_{T, \delta}) \ge 1-\delta$.

Note that these two different approaches say very different things about the regret. The first approach says that the average regret is at most $M_T$. However, the agent might still achieve higher regret on many runs. The second approach says that, with high probability, the agent will achieve regret at most $M_{T, \delta}$. However, it doesn’t say anything about the regret in the remaining $\delta$ fraction of runs, which might be arbitrarily high.

We’d like to achieve sublinear regret in expectation, i.e. E[RegretT]=o(T)\E[\text{Regret}_T] = o(T). That is, as we learn more about the environment, we’d like to be able to exploit that knowledge to take the optimal arm as often as possible.
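To make the expectation concrete, here is a small Monte Carlo helper (hypothetical, not part of the course code) that estimates $\E[\text{Regret}_T]$ by averaging the total regret over independent runs, assuming the classes defined in this chapter:

def estimate_expected_regret(make_agent: Callable[[], Agent], mab: MAB, n_runs: int = 100) -> float:
    """Estimate E[Regret_T] by averaging the total regret over `n_runs` runs."""
    totals = []
    for _ in range(n_runs):
        agent = make_agent()  # fresh agent (and fresh randomness) per run
        mab_loop(mab, agent)
        totals.append(sum(regret_per_step(mab, agent)))
    return float(np.mean(totals))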

The rest of the chapter comprises a series of increasingly sophisticated MAB algorithms.

def plot_strategy(mab: MAB, agent: Agent):
    plt.figure(figsize=(10, 6))

    # plot reward and cumulative regret
    # (plotting code partially elided in the source; a plausible sketch)
    plt.plot(np.cumsum(agent.rewards), label="reward")
    plt.plot(np.cumsum(regret_per_step(mab, agent)), label="cumulative regret")
     plt.xlabel("timestep")
     plt.legend()
     plt.title(f"{agent.__class__.__name__} reward and regret")
    plt.show()

3.2 Pure exploration (random guessing)

A trivial strategy is to always choose arms at random (i.e. “pure exploration”).

class PureExploration(Agent):
    def choose_arm(self):
        """Choose an arm uniformly at random."""
        return solutions.pure_exploration_choose_arm(self)
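The `solutions` module is course-provided and not shown here; a minimal sketch of what this choice rule might look like (an assumption, not the official implementation):

def pure_exploration_choose_arm(agent: Agent) -> int:
    # choose an arm index uniformly at random from 0, ..., K-1
    return int(np.random.randint(agent.K))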

Note that

$$\E_{a_t \sim \text{Unif}([K])}[\mu^{a_t}] = \bar \mu = \frac{1}{K} \sum_{k=1}^K \mu^k$$

so the expected regret is simply

$$\begin{aligned}
\E[\text{Regret}_T] &= \sum_{t=0}^{T-1} \E[\mu^\star - \mu^{a_t}] \\
&= T (\mu^\star - \bar \mu) > 0.
\end{aligned}$$

This scales as $\Theta(T)$, i.e. linear in the number of timesteps $T$. There’s no learning here: the agent doesn’t use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears “(uniformly) random”.

agent = PureExploration(mab.K, mab.T)
mab_loop(mab, agent)
plot_strategy(mab, agent)

[figure: PureExploration reward and regret]

3.3 Pure greedy

How might we improve on pure exploration? Instead, we could try each arm once, and then commit to the one with the highest observed reward. We’ll call this the pure greedy strategy.

class PureGreedy(Agent):
    def choose_arm(self):
        """Choose the arm with the highest observed reward on its first pull."""
        return solutions.pure_greedy_choose_arm(self)
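Again, a hypothetical sketch of the rule described above (the official version lives in `solutions`):

def pure_greedy_choose_arm(agent: Agent) -> int:
    t = len(agent.choices)
    if t < agent.K:
        return t  # exploration phase: pull each arm once
    # exploitation phase: commit to the arm whose single exploratory pull paid off most
    return int(np.argmax(agent.rewards[: agent.K]))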

Note we’ve used superscripts $r^k$ during the exploration phase to indicate that we observe exactly one reward for each arm. Then we use subscripts $r_t$ during the exploitation phase to indicate that we observe a sequence of rewards from the chosen greedy arm $\hat k$.

How does the expected regret of this strategy compare to that of pure exploration? For intuition, consider the simple case of just two arms, with Bernoulli reward distributions with means $\mu^0 > \mu^1$.

Let’s let $r^0$ be the random reward from the first arm and $r^1$ be the random reward from the second. If $r^0 > r^1$, then we achieve zero regret. Otherwise, we achieve regret $T(\mu^0 - \mu^1)$. Thus, the expected regret is simply:

$$\begin{aligned}
\E[\text{Regret}_T] &= \pr(r^0 < r^1) \cdot T(\mu^0 - \mu^1) + c \\
&= (1 - \mu^0) \mu^1 \cdot T(\mu^0 - \mu^1) + c
\end{aligned}$$

Which is still $\Theta(T)$, the same as pure exploration!

agent = PureGreedy(mab.K, mab.T)
mab_loop(mab, agent)
plot_strategy(mab, agent)

[figure: PureGreedy reward and regret]

The cumulative regret is a straight line because the regret only depends on the arms chosen and not the actual reward observed. In fact, if the greedy algorithm happens to get lucky on the first set of pulls, it may act entirely optimally for that episode! But its average regret is what measures its effectiveness.

3.4 Explore-then-commit

We can improve the pure greedy algorithm as follows: let’s reduce the variance of the reward estimates by pulling each arm $N_{\text{explore}} > 1$ times before committing. This is called the explore-then-commit strategy. Note that the “pure greedy” strategy above is just the special case where $N_{\text{explore}} = 1$.

class ExploreThenCommit(Agent):
    def __init__(self, K: int, T: int, N_explore: int):
        super().__init__(K, T)
        self.N_explore = N_explore

    def choose_arm(self):
        return solutions.etc_choose_arm(self)
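A hypothetical sketch of the explore-then-commit rule:

def etc_choose_arm(agent: ExploreThenCommit) -> int:
    t = len(agent.choices)
    if t < agent.K * agent.N_explore:
        return t % agent.K  # round-robin through the arms N_explore times each
    # afterwards, commit to the arm with the highest sample mean from exploration
    counts = agent.history.sum(axis=1)
    return int(np.argmax(agent.history[:, 1] / counts))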
agent = ExploreThenCommit(mab.K, mab.T, mab.T // 15)
mab_loop(mab, agent)
plot_strategy(mab, agent)

[figure: ExploreThenCommit reward and regret]

Notice that now, the graphs are much more consistent, and the algorithm finds the true optimal arm and sticks with it much more frequently. We would expect ETC to then have a better (i.e. lower) average regret. Can we prove this?

3.4.1 ETC regret analysis

Let’s analyze the expected regret of the explore-then-commit strategy by splitting it up into the exploration and exploitation phases.

3.4.1.1 Exploration phase.

This phase takes $N_{\text{explore}}K$ timesteps. Since at each step we incur at most 1 regret, the total regret is at most $N_{\text{explore}}K$.

3.4.1.2 Exploitation phase.

This will take a bit more effort. We’ll prove that for any total time $T$, we can choose $N_{\text{explore}}$ such that with arbitrarily high probability, the regret is sublinear.

Let $\hat k$ denote the arm chosen after the exploration phase. We know the regret from the exploitation phase is

$$T_{\text{exploit}} (\mu^\star - \mu^{\hat k}) \qquad \text{where} \qquad T_{\text{exploit}} := T - N_{\text{explore}}K.$$

So we’d like to bound $\mu^\star - \mu^{\hat k} = o(1)$ (as a function of $T$) in order to achieve sublinear regret. How can we do this?

Let’s define $\Delta^k = \hat \mu^k - \mu^k$ to denote how far the mean estimate for arm $k$ is from the true mean. How can we bound this quantity? We’ll use the following useful inequality (Hoeffding’s inequality) for i.i.d. bounded random variables: if $X_0, \dots, X_{n-1}$ are i.i.d. random variables taking values in $[0, 1]$ with mean $\mu$, then their sample mean $\bar X_n$ satisfies

$$\pr\left( |\bar X_n - \mu| > \sqrt{\frac{\ln(2/\delta)}{2n}} \right) \le \delta.$$

The proof of this inequality is beyond the scope of this book. See Vershynin (2018) Chapter 2.2.

We can apply this directly to the rewards for a given arm $k$, since the rewards from that arm are i.i.d.:

$$\pr\left(|\Delta^k| > \sqrt{\frac{\ln(2/\delta)}{2N_{\text{explore}}}} \right) \le \delta.$$
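For example, with $N_{\text{explore}} = 50$ and $\delta = 0.05$, this gives $|\Delta^k| \le \sqrt{\ln(40)/100} \approx 0.19$ with probability at least $0.95$.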

Then to apply this bound to $\hat k$ in particular, we can apply the useful trick of “adding zero”:

$$\begin{aligned}
\mu^{k^\star} - \mu^{\hat k} &= \mu^{k^\star} - \mu^{\hat k} + (\hat \mu^{k^\star} - \hat \mu^{k^\star}) + (\hat \mu^{\hat k} - \hat \mu^{\hat k}) \\
&= \Delta^{\hat k} - \Delta^{k^\star} + \underbrace{(\hat \mu^{k^\star} - \hat \mu^{\hat k})}_{\le 0 \text{ by definition of } \hat k} \\
&\le 2 \sqrt{\frac{\ln(2K/\delta')}{2N_{\text{explore}}}} \text{ with probability at least } 1-\delta'
\end{aligned}$$

where we’ve set $\delta' = K\delta$. Putting this all together, we’ve shown that, with probability $1 - \delta'$,

$$\text{Regret}_T \le N_{\text{explore}}K + T_{\text{exploit}} \cdot \sqrt{\frac{2\ln(2K/\delta')}{N_{\text{explore}}}}.$$

Note that it suffices for $N_{\text{explore}}$ to be on the order of $\sqrt{T}$ to achieve sublinear regret. In particular, we can find the optimal $N_{\text{explore}}$ by setting the derivative of the r.h.s. to zero:

$$\begin{aligned}
0 &= K - T_{\text{exploit}} \cdot \frac{1}{2} \sqrt{\frac{2\ln(2K/\delta')}{N_{\text{explore}}^3}} \\
N_{\text{explore}} &= \left( T_{\text{exploit}} \cdot \frac{\sqrt{\ln(2K/\delta')/2}}{K} \right)^{2/3}
\end{aligned}$$

Plugging this into the expression for the regret, we have (still with probability $1-\delta'$)

$$\begin{aligned}
\text{Regret}_T &\le 3 T^{2/3} \sqrt[3]{K \ln(2K/\delta') / 2} \\
&= \tilde{O}(T^{2/3} K^{1/3}).
\end{aligned}$$
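As a quick numerical companion, here is a hypothetical helper that evaluates the optimal $N_{\text{explore}}$ from the derivation above (using the approximation $T_{\text{exploit}} \approx T$):

def optimal_n_explore(T: int, K: int, delta_prime: float) -> int:
    """N_explore minimizing the regret bound above, approximating T_exploit by T."""
    return int((T * np.sqrt(np.log(2 * K / delta_prime) / 2) / K) ** (2 / 3))

optimal_n_explore(T=100, K=3, delta_prime=0.05)  # 13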

The ETC algorithm is rather “abrupt” in that it switches from exploration to exploitation after a fixed number of timesteps. In practice, it’s often better to use a more gradual transition, which brings us to the epsilon-greedy algorithm.

3.5 Epsilon-greedy

Instead of doing all of the exploration and then all of the exploitation separately – which additionally requires knowing the time horizon beforehand – we can instead interleave exploration and exploitation by, at each timestep, choosing a random action with some probability. We call this the epsilon-greedy algorithm.

class EpsilonGreedy(Agent):
    def __init__(
        self,
        K: int,
        T: int,
        ε_array: Float[Array, " T"],
    ):
        """ε_array[t] gives the probability of choosing a random arm at timestep t.
        (signature reconstructed from the usage `EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))` below)
        """
        super().__init__(K, T)
        self.ε_array = ε_array

    def choose_arm(self):
        return solutions.epsilon_greedy_choose_arm(self)
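A hypothetical sketch of the ε-greedy rule:

def epsilon_greedy_choose_arm(agent: EpsilonGreedy) -> int:
    t = len(agent.choices)
    if np.random.rand() < agent.ε_array[t]:
        return int(np.random.randint(agent.K))  # explore uniformly at random
    counts = np.maximum(agent.history.sum(axis=1), 1)  # avoid division by zero
    return int(np.argmax(agent.history[:, 1] / counts))  # exploit the best-looking arm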
agent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))
mab_loop(mab, agent)
plot_strategy(mab, agent)

[figure: EpsilonGreedy reward and regret]

Note that we let ε vary over time. In particular, we might want to gradually decrease ε as we learn more about the reward distributions and no longer need to spend time exploring.

It turns out that setting $\epsilon_t = \sqrt[3]{K \ln(t)/t}$ also achieves a regret of $\tilde O(t^{2/3} K^{1/3})$ (ignoring the logarithmic factors). (We will not prove this here.) TODO ADD PROOF CITATION
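Such a decaying schedule could be constructed as follows (a sketch; values above 1 at small $t$ simply mean “always explore”, and $\epsilon_1 = 0$ since $\ln 1 = 0$):

ts = np.arange(1, mab.T + 1)
agent = EpsilonGreedy(mab.K, mab.T, np.cbrt(mab.K * np.log(ts) / ts))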

In ETC, we had to set $N_{\text{explore}}$ based on the total number of timesteps $T$. But the epsilon-greedy algorithm actually handles the exploration automatically: the regret rate holds for any $t$, and doesn’t depend on the final horizon $T$.

But the way these algorithms explore is rather naive: we’ve been exploring uniformly across all the arms. But what if we could be smarter about it, and explore more for arms that we’re less certain about?

3.6 Upper Confidence Bound (UCB)

To quantify how certain we are about the mean of each arm, we’ll compute confidence intervals for our estimators, and then choose the arm with the highest upper confidence bound. This operates on the principle of the benefit of the doubt (i.e. optimism in the face of uncertainty). One subtlety: Hoeffding-style bounds assume a fixed sample size, but the number of times we’ve pulled an arm is itself random, so we’ll need a bound that holds uniformly across all timesteps and arms. Let’s introduce some notation to discuss this.

Let $N^k_t$ denote the (random) number of times arm $k$ has been pulled within the first $t$ timesteps, and $\hat \mu^k_t$ denote the sample average of those pulls. That is,

$$\begin{aligned}
N^k_t &:= \sum_{\tau=0}^{t-1} \mathbf{1} \{ a_\tau = k \} \\
\hat \mu^k_t &:= \frac{1}{N^k_t} \sum_{\tau=0}^{t-1} \mathbf{1} \{ a_\tau = k \} r_\tau.
\end{aligned}$$

To achieve the “fixed sample size” assumption, we’ll need to shift our index from time to number of samples from each arm. In particular, we’ll define $\tilde r^k_n$ to be the $n$th sample from arm $k$, and $\tilde \mu^k_n$ to be the sample average of the first $n$ samples from arm $k$. Since the rewards from a given arm are i.i.d. no matter when they are observed, Hoeffding’s inequality applies for each fixed $n$. But how do we handle the randomness of $N^k_t$? Well, we know $N^k_t \le t$ (where equality would be the case if and only if we had pulled arm $k$ every time). So we can apply the same trick as last time, where we uniform-ize across all possible values of $N^k_t$:

P(nt,μ~nkμkln(2/δ)2n)1tδ.\begin{aligned} +NtkN^k_t:

P(nt,μ~nkμkln(2/δ)2n)1tδ.\begin{aligned} \pr\left( \forall n \le t, |\tilde \mu^k_n - \mu^k | \le \sqrt{\frac{\ln(2/\delta)}{2n}} \right) &\ge 1-t\delta. \end{aligned}

In particular, since $N^k_t \le t$, and $\tilde \mu^k_{N^k_t} = \hat \mu^k_t$ by definition, we have

\begin{aligned}
\pr\left( |\hat \mu^k_t - \mu^k | \le \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}} \right) \ge 1-\delta' \text{ where } \delta' := t \delta.
\end{aligned}
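
To get a feel for the scale of this interval, we can plug in some example numbers ($t$, $N^k_t$, and $\delta'$ below are all assumptions for illustration):

t, N_k, delta_prime = 100, 20, 0.05
width = np.sqrt(np.log(2 * t / delta_prime) / (2 * N_k))
print(width)  # ≈ 0.46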

This bound would then suffice for applying the UCB algorithm! That is, the upper confidence bound for arm $k$ would be

M^k_t := \hat \mu^k_t + \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}},

where we can choose $\delta'$ depending on how tight we want the interval to be.

  • A smaller $\delta'$ would give us a larger and higher-confidence interval, emphasizing the exploration term.
  • A larger $\delta'$ would give a tighter and lower-confidence interval, prioritizing the current sample averages.

We can now use this to define the UCB algorithm.

class UCB(Agent):
    def __init__(self, K: int, T: int, delta: float):
        super().__init__(K, T)
        self.delta = delta

    def choose_arm(self):
        return solutions.ucb_choose_arm(self)
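
As before, the arm choice is implemented in the solutions module. A sketch of one way it could be implemented, again using only the Agent fields defined earlier (an illustration, not necessarily the reference solution):

def ucb_choose_arm_sketch(agent: UCB) -> int:
    """Try each arm once, then maximize the upper confidence bound."""
    pulls = agent.history.sum(axis=1)  # N^k_t for each arm
    if (pulls == 0).any():
        return int(np.argmax(pulls == 0))  # pull each arm at least once first
    t = agent.count()
    mu_hat = agent.history[:, 1] / pulls  # sample mean of each arm
    bonus = np.sqrt(np.log(2 * t / agent.delta) / (2 * pulls))  # CI half-width
    return random_argmax(mu_hat + bonus)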

Intuitively, UCB prioritizes arms where:

  1. $\hat \mu^k_t$ is large, i.e. the arm has a high sample average, and we'd choose it for exploitation, and

  2. $\sqrt{\frac{\ln(2t/\delta')}{2N^k_t}}$ is large, i.e. we're still uncertain about the arm, and we'd choose it for exploration.

As desired, this explores in a smarter, adaptive way compared to the previous algorithms. Does it achieve lower regret?

agent = UCB(mab.K, mab.T, 0.9)
mab_loop(mab, agent)
plot_strategy(mab, agent)

<Figure size 1000x600 with 1 Axes>

3.6.1 UCB regret analysis

First we'll bound the regret incurred at each timestep. Then we'll bound the total regret across timesteps.

For the sake of analysis, we'll use a slightly looser bound that applies across the whole time horizon and across all arms. We'll omit the derivation since it's very similar to the above (walk through it yourself for practice).

\begin{aligned}
\pr\left(\forall k \le K, t < T. |\hat \mu^k_t - \mu^k | \le B^k_t \right) &\ge 1-\delta'' \\
\text{where} \quad B^k_t &:= \sqrt{\frac{\ln(2TK/\delta'')}{2N^k_t}}.
\end{aligned}

Intuitively, $B^k_t$ denotes the width of the CI for arm $k$ at time $t$. Then, assuming the above uniform bound holds (which occurs with probability $1-\delta''$), we can bound the regret at each timestep as follows:

\begin{aligned}
\mu^\star - \mu^{a_t} &\le \hat \mu^{k^\star}_t + B_t^{k^\star} - \mu^{a_t} && \text{applying UCB to arm } k^\star \\
&\le \hat \mu^{a_t}_t + B^{a_t}_t - \mu^{a_t} && \text{since UCB chooses } a_t = \arg \max_{k \in [K]} \hat \mu^k_t + B_t^{k} \\
&\le 2 B^{a_t}_t && \text{since } \hat \mu^{a_t}_t - \mu^{a_t} \le B^{a_t}_t \text{ by definition of } B^{a_t}_t
\end{aligned}

Summing this across timesteps gives

\begin{aligned}
\text{Regret}_T &\le \sum_{t=0}^{T-1} 2 B^{a_t}_t \\
&= \sqrt{2\ln(2TK/\delta'')} \sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\
\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \sum_{t=0}^{T-1} \sum_{k=1}^K \mathbf{1}\{ a_t = k \} (N^k_t)^{-1/2} \\
&= \sum_{k=1}^K \sum_{n=1}^{N^k_T} n^{-1/2} \\
&\le K \sum_{n=1}^{T} n^{-1/2} \\
\sum_{n=1}^{T} n^{-1/2} &\le 1 + \int_1^T x^{-1/2} \, \mathrm{d}x \\
&= 1 + \left( 2\sqrt{x} \right)_1^T \\
&= 2\sqrt{T} - 1 \\
&\le 2\sqrt{T}
\end{aligned}

Putting everything together gives

\begin{aligned}
\text{Regret}_T &\le 2 K \sqrt{2T \ln(2TK/\delta'')} && \text{with probability } 1-\delta'' \\
&= \tilde O(K\sqrt{T})
\end{aligned}
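
To put a number on this, here is the bound for the example environment above ($K = 3$, $T = 100$) with $\delta'' = 0.05$ (a value we've picked for illustration). Note that for such a small $T$ it exceeds the trivial bound $\text{Regret}_T \le T$; the guarantee only becomes informative for larger $T$:

K, T, delta = 3, 100, 0.05
bound = 2 * K * np.sqrt(2 * T * np.log(2 * T * K / delta))
print(bound)  # ≈ 260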

In fact, we can do a more sophisticated analysis to trim off a factor of $\sqrt{K}$.

3.6.2 Lower bound on regret (intuition)

Is it possible to do better than $\Omega(\sqrt{T})$ regret in general? In fact, no: with $T$ total pulls, we can only estimate each arm's mean to within an error on the order of $1/\sqrt{T}$. So if two arms' means are within about $1/\sqrt{T}$ of each other, we won't be able to confidently tell them apart, and will sample them about equally. But then we'll incur regret

\Omega((T/2) \cdot (1/\sqrt{T})) = \Omega(\sqrt{T}).
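
As a sketch of such a hard instance, using the MAB environment class from earlier (the specific numbers are our choices for illustration):

T = 10_000
gap = 1 / np.sqrt(T)  # ≈ 0.01: too small to resolve confidently within T pulls
hard_mab = MAB(means=np.array([0.5, 0.5 + gap]), T=T)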

3.7 Thompson sampling and Bayesian bandits

So far, we've treated the parameters $\mu^0, \dots, \mu^{K-1}$ of the reward distributions as fixed. Instead, we can take a Bayesian approach where we treat them as random variables from some prior distribution. Then, upon pulling an arm and observing a reward, we can condition on this observation to update the posterior distribution over the parameters. This fully describes the information we gain about the parameters from observing the reward.

From this Bayesian perspective, the Thompson sampling algorithm follows naturally: just sample from the distribution of the optimal arm, given the observations!

class Distribution:
    def sample(self) -> Float[Array, " K"]:
        """Sample a vector of means for the K arms."""
        ...

    def update(self, arm: int, reward: float):
        """Condition on obtaining `reward` from the given arm."""
        ...

class ThompsonSampling(Agent):
    def __init__(self, K: int, T: int, prior: Distribution):
        super().__init__(K, T)
        self.distribution = prior

    def choose_arm(self):
        # sample one plausible mean per arm from the current posterior,
        # then play the arm whose sampled mean is highest
        means = self.distribution.sample()
        return random_argmax(means)

    def update_history(self, arm: int, reward: int):
        super().update_history(arm, reward)
        self.distribution.update(arm, reward)

In other words, we sample each arm proportionally to how likely we think it is to be optimal, given the observations so far. This strikes a good exploration-exploitation tradeoff: we explore more for arms that we're less certain about, and exploit more for arms that we're more certain about. Thompson sampling is a simple yet powerful algorithm that achieves state-of-the-art performance in many settings.

To implement Thompson sampling for Bernoulli rewards, we can use a Beta prior, which is conjugate to the Bernoulli: after each observation, the posterior is again a Beta distribution whose parameters we can update in closed form, rather than recomputing the entire posterior distribution from scratch.

class Beta(Distribution):
    def __init__(self, K: int, alpha: int = 1, beta: int = 1):
        self.alphas = np.full(K, alpha)
        self.betas = np.full(K, beta)

    def sample(self):
        # draw one sample of the mean for each arm from its Beta posterior
        return np.random.beta(self.alphas, self.betas)
 
    def update(self, arm: int, reward: int):
        self.alphas[arm] += reward
        self.betas[arm] += 1 - reward

beta_distribution = Beta(mab.K)
agent = ThompsonSampling(mab.K, mab.T, beta_distribution)
mab_loop(mab, agent)
plot_strategy(mab, agent)

<Figure size 1000x600 with 1 Axes>

It turns out that asymptotically, Thompson sampling is optimal in the following sense. Lai & Robbins (1985) prove an instance-dependent lower bound that says for any bandit algorithm,

\liminf_{T \to \infty} \frac{\E[N_T^k]}{\ln(T)} \ge \frac{1}{\text{KL}(\mu^k \parallel \mu^\star)}

where

\text{KL}(\mu^k \parallel \mu^\star) = \mu^k \ln \frac{\mu^k}{\mu^\star} + (1 - \mu^k) \ln \frac{1 - \mu^k}{1 - \mu^\star}

measures the Kullback-Leibler divergence from the Bernoulli distribution with mean $\mu^k$ to the Bernoulli distribution with mean $\mu^\star$. It turns out that Thompson sampling achieves this lower bound with equality! That is, not only is the error rate optimal, but the constant factor is optimal as well.
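
Here is that KL divergence in code, evaluated at illustrative means $\mu^k = 0.4$ and $\mu^\star = 0.8$ (values assumed for the example); the lower bound then says any algorithm must pull this suboptimal arm at least roughly $\ln(T) / 0.38$ times:

def kl_bernoulli(mu_k: float, mu_star: float) -> float:
    """KL divergence from Bernoulli(mu_k) to Bernoulli(mu_star)."""
    return mu_k * np.log(mu_k / mu_star) + (1 - mu_k) * np.log(
        (1 - mu_k) / (1 - mu_star)
    )

print(kl_bernoulli(0.4, 0.8))  # ≈ 0.38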

3.8 Contextual bandits

In the above MAB environment, the reward distributions of the arms remain constant. However, in many real-world settings, we might receive additional information that affects these distributions. For example, in the online advertising case where each arm corresponds to an ad we could show the user, the probability of a click might depend on who that user is. We call this extra information the context $x_t$. At each timestep, the learner gets to observe the context, and choose an action $a_t$ according to some context-dependent policy $\pi_t(x_t)$. Then, the learner observes the reward from the chosen arm $r_t \sim \nu^{a_t}(x_t)$. The reward distribution also depends on the context.

Assuming our context is discrete, we can just perform the same algorithms, treating each context-arm pair as its own arm. This gives us an enlarged MAB of $K |\mathcal{X}|$ arms.
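
A sketch of this reduction, assuming the contexts are labeled $0, \dots, |\mathcal{X}| - 1$ (the helper name is ours):

def enlarged_arm(context: int, arm: int, K: int) -> int:
    """Index of the (context, arm) pair among the K * |X| enlarged arms."""
    return context * K + arm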

Recall that running UCB for $T$ timesteps on an MAB with $K$ arms achieves a regret bound of $\tilde O(\sqrt{TK})$. Applied to the enlarged MAB, this becomes $\tilde O(\sqrt{TK|\mathcal{X}|})$, which grows quickly as the number of contexts gets large. Note that this approach also treats the contexts as unrelated to each other, while in practice, often contexts are related to each other in some way: for example, we might want to advertise similar products to users with similar preferences. How can we incorporate this structure into our solution?

3.8.1 Linear contextual bandits

We want to model the mean reward of arm $k$ as a function of the context, i.e. $\mu^k(x)$. One simple model is the linear one: $\mu^k(x) = x^\top \theta^k$, where $x \in \mathcal{X} = \mathbb{R}^d$ and $\theta^k \in \mathbb{R}^d$ describes a feature direction for arm $k$. Recall that supervised learning gives us a way to estimate a conditional expectation from samples: we learn a least squares estimator from the timesteps where arm $k$ was selected:

\hat \theta_t^k = \arg\min_{\theta \in \mathbb{R}^d} \sum_{\{ i \in [t] : a_i = k \}} (r_i - x_i^\top \theta)^2.

This has the closed-form solution known as the ordinary least squares (OLS) estimator:

\begin{aligned}
\hat \theta_t^k & = (A_t^k)^{-1} \sum_{\{ i \in [t] : a_i = k \}} x_i r_i \\
\text{where} \quad A_t^k & = \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top.
\end{aligned}

To apply UCB here, we need an upper confidence bound on this estimate. Previously, we used Hoeffding's inequality to bound the distance of the sample mean, our estimator, from the true mean. However, now our estimator is not a sample mean, but rather the OLS estimator above (3.30). Instead, we'll use Chebyshev's inequality to construct an upper confidence bound.
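
First, though, here is the OLS estimator above as a sketch in code, where X stacks the contexts $x_i$ from the timesteps where arm $k$ was chosen (one per row) and r holds the corresponding rewards (both names are ours):

def ols_estimate(X: np.ndarray, r: np.ndarray) -> np.ndarray:
    """OLS estimator: solve A θ = Σ x_i r_i with A = Σ x_i x_iᵀ = XᵀX."""
    A = X.T @ X  # assumed invertible; in practice one adds a ridge term λI
    return np.linalg.solve(A, X.T @ r)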

Chebyshev's inequality says that, for a random variable $Y$ with $\E Y = 0$ and $\E Y^2 = \sigma^2$,

|Y| \le \beta \sigma \quad \text{with probability} \ge 1 - \frac{1}{\beta^2}.

Since the OLS estimator is known to be unbiased (try proving this yourself), we can apply Chebyshev's inequality to $x_t^\top (\hat \theta_t^k - \theta^k)$:

\begin{aligned}
x_t^\top \theta^k \le x_t^\top \hat \theta_t^k + \beta \sqrt{x_t^\top (A_t^k)^{-1} x_t} \quad \text{with probability} \ge 1 - \frac{1}{\beta^2}
\end{aligned}

The first term is exactly our predicted reward $\hat \mu^k_t(x_t)$. To interpret the second term, note that

x_t^\top (A_t^k)^{-1} x_t = \frac{1}{N_t^k} x_t^\top (\Sigma_t^k)^{-1} x_t,

where

\Sigma_t^k = \frac{1}{N_t^k} \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top

is the empirical covariance matrix of the contexts (assuming that the context has mean zero). That is, the learner is encouraged to choose arms when $x_t$ is not aligned with the data seen so far, or if arm $k$ has not been explored much and so $N_t^k$ is small.

We can now substitute these quantities into UCB to get the LinUCB algorithm:

class LinUCBPseudocode(Agent):
    def __init__(
        self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float]
    ):
        super().__init__(K, T)
        self.get_c = get_c
        # one least squares problem per arm; initializing A to λI (ridge
        # regularization) keeps it invertible before we have enough data
        self.A = np.stack([lam * np.eye(D) for _ in range(K)])
        self.targets = np.zeros((K, D))
        self.w = np.zeros((K, D))

    def choose_arm(self, context: Float[Array, " D"]):
        # upper confidence bound: predicted reward plus exploration bonus
        c = self.get_c(self.count())
        bonus = np.array([
            np.sqrt(context @ np.linalg.solve(self.A[k], context)) for k in range(self.K)
        ])
        return random_argmax(self.w @ context + c * bonus)

    def update_history(self, context: Float[Array, " D"], arm: int, reward: int):
        self.A[arm] += np.outer(context, context)
        self.targets[arm] += context * reward
        self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm])

$c_t$ is similar to the $\log (2t/\delta')$ term of UCB: it controls the width of the confidence interval. Here, we treat it as a tunable parameter, though in a theoretical analysis, it would depend on $A_t^k$ and the probability δ with which the bound holds.
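
For example, two simple schedules one might try (both are our own choices for illustration, not theoretically calibrated):

constant_c = lambda t: 1.0  # fixed-width confidence intervals
growing_c = lambda t: np.sqrt(np.log(2 * (t + 1) / 0.05))  # mimics UCB's log(2t/δ′) growth

agent = LinUCBPseudocode(K=3, T=100, D=5, lam=0.1, get_c=growing_c)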

Using tools similar to those used for UCB, we can also prove an $\tilde{O}(\sqrt{T})$ regret bound. The full details of the analysis can be found in Section 3 of Agarwal et al. (2022).

3.9 Summary

In this chapter, we explored the multi-armed bandit setting for analyzing sequential decision-making in an unknown environment.

References
  1. Vershynin, R. (2018). High-Dimensional Probability: An Introduction with Applications in Data Science. Cambridge University Press.
  2. Lai, T. L., & Robbins, H. (1985). Asymptotically Efficient Adaptive Allocation Rules. Advances in Applied Mathematics, 6(1), 4–22. doi:10.1016/0196-8858(85)90002-8
  3. Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms.
\ No newline at end of file diff --git a/bandits.json b/bandits.json index 2657b1e..69cbce9 100644 --- a/bandits.json +++ b/bandits.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"cb8437494713e13080ce9e296ca5fbb4d04ebda213c523132d19db6324b795e6","slug":"bandits","location":"/bandits.md","dependencies":[],"frontmatter":{"title":"3 Multi-Armed Bandits","numbering":{"all":{"enabled":true},"enumerator":{"template":"3.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"bandits.md","url":"/build/bandits-edc5c0bbc4c299ec710273a0eb78717a.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"ijCddxDMcG"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"3.1","key":"H7RGl8KEEL"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":21,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Of7adM7xax"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"OmlTUr1cYd"}],"key":"vHxbCqgmlw"},{"type":"text","value":" (MAB) setting is a simple setting for studying the basic challenges of sequential decision-making.\nIn this setting, an agent repeatedly chooses from a fixed set of actions, called ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"fN6rvkLxJb"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"arms","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"yHl1Iyr9n7"}],"key":"Pq06GNrJq0"},{"type":"text","value":", each of which has an associated reward distribution. 
The agent’s goal is to maximize the total reward it receives over some time period.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ymRHJ6a3Bc"}],"key":"idnCzVdKoN"},{"type":"comment","value":" \n| States | Actions | Rewards |\n| :----: | :-----: | :---------------------------------: |\n| None | Finite | $\\mathcal{A} \\to \\triangle([0, 1])$ |\n","key":"ih2QUrS0ee"},{"type":"paragraph","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"In particular, we’ll spend a lot of time discussing the ","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"D6iW63KX2l"},{"type":"strong","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Exploration-Exploitation Tradeoff","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"VZ1376t7yh"}],"key":"dYTV6onDHD"},{"type":"text","value":": should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good?","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"e6hp2T99AM"}],"key":"xwSqUtVC1Y"},{"type":"proof","kind":"example","label":"advertising","identifier":"advertising","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Online advertising","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"epUnUXGt2C"}],"key":"ZyuDgSm8gN"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"Let’s suppose you, the agent, are an advertising company. You have ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"gCcQiqQ7mo"},{"type":"inlineMath","value":"K","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"KKK","key":"t3kcDZYEUF"},{"type":"text","value":" different ads that you can show to users; For concreteness, let’s suppose there’s just a single user. You receive ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"ug4zIYU4ut"},{"type":"text","value":"1","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"ATRewe7ke0"},{"type":"text","value":" reward if the user clicks the ad, and ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"aMP7vTBFr6"},{"type":"text","value":"0","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"Pgrq6kmTCD"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"UlzRpPwVcN"},{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"t9wyZ0UDJN"}],"key":"QpPHOifugX"},{"type":"text","value":" associated to each ad is a Bernoulli distribution defined by the probability that the user clicks on the ad. 
Your goal is to maximize the total number of clicks by the user.","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"BM2SVAfQO1"}],"key":"IThuVMUCSM"}],"enumerator":"3.1","html_id":"advertising","key":"jVOva8654J"},{"type":"proof","kind":"example","label":"clinical_trials","identifier":"clinical_trials","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Clinical trials","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"eKmBpLsSYe"}],"key":"RhvIY4TGGh"},{"type":"paragraph","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"Suppose you’re a pharmaceutical company, and you’re testing a new drug. You have ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"uVKkmmk6Q0"},{"type":"inlineMath","value":"K","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"html":"KKK","key":"JuPobLkI6K"},{"type":"text","value":" different dosages of the drug that you can administer to patients. You receive ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"Xd6YC3XpkV"},{"type":"text","value":"1","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"vE6SIvLuvl"},{"type":"text","value":" reward if the patient recovers, and ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"UA4ZpP1GJ1"},{"type":"text","value":"0","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"qysCiitSW5"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"uWo4BEcRJx"},{"type":"emphasis","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"F8jCjqr7ee"}],"key":"FxO5GGExHK"},{"type":"text","value":" associated to each dosage is a Bernoulli distribution defined by the probability that the patient recovers. Your goal is to maximize the total number of patients that recover.","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"tijtyFSJa2"}],"key":"Fw5dUVzFPc"}],"enumerator":"3.2","html_id":"clinical-trials","key":"I00aNYDPA5"},{"type":"paragraph","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"children":[{"type":"text","value":"In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. 
We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs.","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"Y5wcJz25dC"}],"key":"Pzuq6M08TZ"}],"key":"jFjze1Y89i"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nimport numpy as np\nimport latexify\nfrom typing import Callable, Union\nimport matplotlib.pyplot as plt\n\nimport solutions.bandits as solutions\n\nnp.random.seed(184)\n\ndef random_argmax(ary: Array) -> int:\n \"\"\"Take an argmax and randomize between ties.\"\"\"\n max_idx = np.flatnonzero(ary == ary.max())\n return np.random.choice(max_idx).item()\n\n\n# used as decorator\nlatex = latexify.algorithmic(\n prefixes={\"mab\"},\n identifiers={\"arm\": \"a_t\", \"reward\": \"r\", \"means\": \"mu\"},\n use_math_symbols=True,\n escape_underscores=False,\n)","key":"qBizE48yuK"},{"type":"output","id":"l8Ee1JpFSjyERlBUKToxn","data":[],"key":"tFGyzvTdX7"}],"data":{},"key":"E24c9ByDLZ"},{"type":"block","position":{"start":{"line":72,"column":1},"end":{"line":72,"column":1}},"children":[{"type":"proof","kind":"remark","label":"multi-armed","identifier":"multi-armed","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Namesake","position":{"start":{"line":74,"column":1},"end":{"line":74,"column":1}},"key":"K2jbObKdmM"}],"key":"akakhwT38s"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"The name “multi-armed bandits” comes from slot machines in casinos, which are often called “one-armed bandits” since they have one arm (the lever) and take money from the player.","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"LrCCnFq8q7"}],"key":"jYSCyHlwWW"}],"enumerator":"3.1","html_id":"multi-armed","key":"BFr1O87qv0"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"QnVmwqlBlS"},{"type":"inlineMath","value":"K","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"KKK","key":"AAVSQe7ZRf"},{"type":"text","value":" denote the number of arms. We’ll label them ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"qbCT7WVGLc"},{"type":"inlineMath","value":"0, \\dots, K-1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"0,,K10, \\dots, K-10,,K1","key":"PJml63Isfp"},{"type":"text","value":" and use ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"dAoWkx6Roe"},{"type":"emphasis","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"superscripts","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"y61AETmNEq"}],"key":"PLWQMT3PRe"},{"type":"text","value":" to indicate the arm index; since we seldom need to raise a number to a power, this won’t cause much confusion. 
In this chapter, we’ll consider the ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"k7nL2caOZc"},{"type":"strong","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Bernoulli bandit","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"lc071ZCRLa"}],"key":"YuJ8qxCQtS"},{"type":"text","value":" setting from the examples above, where arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"T0ip8W8Y8j"},{"type":"inlineMath","value":"k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"kkk","key":"lcumJ3wSen"},{"type":"text","value":" either returns reward ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"OivvQdQdos"},{"type":"text","value":"1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"LEahyDogV7"},{"type":"text","value":" with probability ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"dURDrHHVij"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"μk\\mu^kμk","key":"EP07wSpd89"},{"type":"text","value":" or ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"jlRPPQuxb4"},{"type":"text","value":"0","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"julocz3r8h"},{"type":"text","value":" otherwise. The agent gets to pull an arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"FGOhQ858CR"},{"type":"inlineMath","value":"T","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"TTT","key":"kwbPSOljxi"},{"type":"text","value":" times in total. 
We can formalize the Bernoulli bandit in the following Python code:","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"l7xf0ZdH0X"}],"key":"eYOgXH0F3m"}],"key":"PcWGtKVt78"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class MAB:\n \"\"\"\n The Bernoulli multi-armed bandit environment.\n\n :param means: the means (success probabilities) of the reward distributions for each arm\n :param T: the time horizon\n \"\"\"\n\n def __init__(self, means: Float[Array, \" K\"], T: int):\n assert all(0 <= p <= 1 for p in means)\n self.means = means\n self.T = T\n self.K = self.means.size\n self.best_arm = random_argmax(self.means)\n\n def pull(self, k: int) -> int:\n \"\"\"Pull the `k`-th arm and sample from its (Bernoulli) reward distribution.\"\"\"\n reward = np.random.rand() < self.means[k].item()\n return +reward","key":"v3QHMv4tvR"},{"type":"output","id":"YT5pp7-E7fMU1sLD-j-da","data":[],"key":"pxZSLaPTG1"}],"data":{},"key":"CHaC55GGH1"},{"type":"block","children":[],"key":"QObO1Kgr3d"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"mab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100)","key":"VvZIBNB3rd"},{"type":"output","id":"i3l2RaxCK4ApIBrxFqO1e","data":[],"key":"nZH5su5qIh"}],"data":{},"key":"AoYiPlAk52"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"In pseudocode, the agent’s interaction with the MAB environment can be\ndescribed by the following process:","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"kqCuFdEFIe"}],"key":"SZhZR58IYE"}],"key":"SDuUJcOyzv"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"@latex\ndef mab_loop(mab: MAB, agent: \"Agent\") -> int:\n for t in range(mab.T):\n arm = agent.choose_arm() # in 0, ..., K-1\n reward = mab.pull(arm)\n agent.update_history(arm, reward)\n\n\nmab_loop","key":"GvcigxOf5w"},{"type":"output","id":"oM-mjePiWu5k-wxFiwoTZ","data":[{"output_type":"execute_result","execution_count":4,"metadata":{},"data":{"text/plain":{"content":"","content_type":"text/plain"},"text/latex":{"content":"$ \\begin{array}{l} \\mathbf{function} \\ \\mathrm{mab\\_loop}(\\mathrm{mab}, \\mathrm{agent}) \\\\ \\hspace{1em} \\mathbf{for} \\ t \\in \\mathrm{range} \\mathopen{}\\left( T \\mathclose{}\\right) \\ \\mathbf{do} \\\\ \\hspace{2em} \\mathrm{a\\_t} \\gets \\mathrm{agent}.\\mathrm{choose\\_arm} \\mathopen{}\\left( \\mathclose{}\\right) \\\\ \\hspace{2em} r \\gets \\mathrm{pull} \\mathopen{}\\left( \\mathrm{a\\_t} \\mathclose{}\\right) \\\\ \\hspace{2em} \\mathrm{agent}.\\mathrm{update\\_history} \\mathopen{}\\left( \\mathrm{a\\_t}, r \\mathclose{}\\right) \\\\ \\hspace{1em} \\mathbf{end \\ for} \\\\ \\mathbf{end \\ function} \\end{array} $","content_type":"text/latex"}}}],"key":"p2g0xcbT76"}],"data":{},"key":"fG4Pyfm3v5"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"VPV03y3DfD"},{"type":"inlineCode","value":"Agent","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"nXXqd52Bln"},{"type":"text","value":" class stores the pull history and uses it to decide which arm to pull next. 
Since we are working with Bernoulli bandits, we can summarize the pull history concisely in a ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"CNwgQUZn4a"},{"type":"inlineMath","value":"\\mathbb{N}^{K \\times 2}","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"NK×2\\mathbb{N}^{K \\times 2}NK×2","key":"A8jYNSXOmz"},{"type":"text","value":" array.","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"rAL6QPBsXZ"}],"key":"PYuaNNpwHL"}],"key":"M4p7qQMpiJ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Agent:\n def __init__(self, K: int, T: int):\n \"\"\"The MAB agent that decides how to choose an arm given the past history.\"\"\"\n self.K = K\n self.T = T\n self.rewards = [] # for plotting\n self.choices = []\n self.history = np.zeros((K, 2), dtype=int)\n\n def choose_arm(self) -> int:\n \"\"\"Choose an arm of the MAB. Algorithm-specific.\"\"\"\n ...\n\n def count(self) -> int:\n \"\"\"The number of pulls made. Also the current step index.\"\"\"\n return len(self.rewards)\n\n def update_history(self, arm: int, reward: int):\n self.rewards.append(reward)\n self.choices.append(arm)\n self.history[arm, reward] += 1","key":"jhDhrSohEl"},{"type":"output","id":"AkvgXbCXLYjBXqPUmlX6t","data":[],"key":"OsTG3YBPM7"}],"data":{},"key":"pnSVXBWF1p"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":149,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"What’s the ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"tD7M6rAyn4"},{"type":"emphasis","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"optimal","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"i08N7pXVrd"}],"key":"hRhgKKFW7h"},{"type":"text","value":" strategy for the agent, i.e. the one that achieves\nthe highest expected reward? 
Convince yourself that the agent should try\nto always pull the arm with the highest expected reward:","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"xU5RYidXEq"}],"key":"q8kZ5QrIEp"},{"type":"math","value":"\\mu^\\star := \\max_{k \\in [K]} \\mu^k.","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"μ:=maxk[K]μk.\\mu^\\star := \\max_{k \\in [K]} \\mu^k.μ:=k[K]maxμk.","enumerator":"3.1","key":"c9I8K7LhET"},{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"The goal, then, can be rephrased as to minimize the ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"mhKbcRZn8F"},{"type":"strong","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"PVWvGDnhOu"}],"key":"yUWWLVYsPx"},{"type":"text","value":", defined\nbelow:","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"QuruEBIYzC"}],"key":"Mm8N1r2sJ5"},{"type":"proof","kind":"definition","label":"regret","identifier":"regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Regret","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"wsdQWQz2Fp"}],"key":"J9WE6Q5mxf"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"The agent’s ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"gaRvtIT2fA"},{"type":"strong","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"AHW93m8kzI"}],"key":"yHy1QQk45z"},{"type":"text","value":" after ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"Ln2c3Bs7RD"},{"type":"inlineMath","value":"T","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"html":"TTT","key":"Qas0x4j0uJ"},{"type":"text","value":" timesteps is defined as","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"yzkwqDuLAY"}],"key":"m5j0yjqv3Q"},{"type":"math","value":"\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.","position":{"start":{"line":163,"column":1},"end":{"line":165,"column":1}},"html":"RegretT:=t=0T1μμat.\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.RegretT:=t=0T1μμat.","enumerator":"3.2","key":"zFwZvOkrtA"}],"enumerator":"3.1","html_id":"regret","key":"as5etfdEWN"}],"key":"UAqUrpJe0i"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def regret_per_step(mab: MAB, agent: Agent):\n \"\"\"Get the difference from the average reward of the optimal arm. 
The sum of these is the regret.\"\"\"\n return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices]","key":"pGds6gMxSC"},{"type":"output","id":"REa9p9MtSB8lOiaPV3a3c","data":[],"key":"jyjWCdpjMm"}],"data":{},"key":"DvaUyuub1j"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":174,"column":1},"end":{"line":178,"column":1}},"children":[{"type":"text","value":"Note that this depends on the ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"k1vvYYORbd"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"true means","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"C3zxsSLgrv"}],"key":"teu0ghNWYA"},{"type":"text","value":" of the pulled arms, ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"UIcBBNBErF"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"GfV5HH9WOZ"}],"key":"gPw4fcdEgK"},{"type":"text","value":" the actual\nobserved rewards.\nWe typically think of this as a random variable where\nthe randomness comes from the agent’s strategy (i.e. the sequence of\nactions ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"KLmRt7QWS0"},{"type":"inlineMath","value":"a_0, \\dots, a_{T-1}","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"html":"a0,,aT1a_0, \\dots, a_{T-1}a0,,aT1","key":"F0AOkSY2RJ"},{"type":"text","value":").","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"GZa5Q8f59m"}],"key":"xTteL4qxVH"},{"type":"paragraph","position":{"start":{"line":180,"column":1},"end":{"line":181,"column":1}},"children":[{"type":"text","value":"Throughout the chapter, we will try to upper bound the regret of various\nalgorithms in two different senses:","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"AP7s0stgfx"}],"key":"N30ZIzO2Sh"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":183,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":183,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":184,"column":1}},"children":[{"type":"text","value":"Upper bound the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"ISHbNaXd5z"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"expected regret,","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"dKToeUPrlr"}],"key":"KA6D8r5ANk"},{"type":"text","value":" i.e. 
show\n","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"lmejy9M4Cl"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] \\le M_T","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"E[RegretT]MT\\E[\\text{Regret}_T] \\le M_TE[RegretT]MT","key":"gzZkzEwVhY"},{"type":"text","value":".","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"vTU3iSubDw"}],"key":"K8JaAqJ8M3"}],"key":"lZ12SHNYP8"},{"type":"listItem","spread":true,"position":{"start":{"line":186,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":186,"column":1},"end":{"line":187,"column":1}},"children":[{"type":"text","value":"Find a ","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"ry1d938n7L"},{"type":"emphasis","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"children":[{"type":"text","value":"high-probability","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"Z3BAd3zGVl"}],"key":"OISdaejpaa"},{"type":"text","value":" upper bound on the regret, i.e. show\n","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"r8hbIOolRw"},{"type":"inlineMath","value":"\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\delta","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"html":"P(RegretTMT,δ)1δ\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\deltaP(RegretTMT,δ)1δ","key":"K3RFnyhQJp"},{"type":"text","value":".","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"yJgx3WYWqU"}],"key":"vBeR7rJ2Ez"}],"key":"LeRW8AMn8w"}],"key":"jNZdvjbnKS"},{"type":"paragraph","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"Note that these two different approaches say very different things about the regret. The first approach says that the ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"scaa73wAGm"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"j4I1p0GwlP"}],"key":"YIX94eMmHD"},{"type":"text","value":" regret is at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"AIlA2FVDlu"},{"type":"inlineMath","value":"M_T","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MTM_TMT","key":"JvvsNZEYdH"},{"type":"text","value":". However, the agent might still achieve higher regret on many runs. The second approach says that, ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"YIMJOd85A5"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"with high probability","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"gUXNJRHp04"}],"key":"NG8vmYOeeR"},{"type":"text","value":", the agent will achieve regret at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"VxhM1DTkPP"},{"type":"inlineMath","value":"M_{T, \\delta}","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MT,δM_{T, \\delta}MT,δ","key":"PEObJqZzPA"},{"type":"text","value":". 
However, it doesn’t say anything about the regret in the remaining ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"SLli5M8PSa"},{"type":"text","value":"δ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"auRNf2IjYs"},{"type":"text","value":" fraction of runs, which might be arbitrarily high.","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"JSHPFcTAQM"}],"key":"ihy3o1cl9Q"},{"type":"paragraph","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"We’d like to achieve ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"I09D2b9G5R"},{"type":"strong","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"sublinear regret","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"GPeP8BP7PC"}],"key":"l0EjFQR1mI"},{"type":"text","value":" in expectation, i.e. ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"BEsr62C4pf"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] = o(T)","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"html":"E[RegretT]=o(T)\\E[\\text{Regret}_T] = o(T)E[RegretT]=o(T)","key":"K5AjqGpz01"},{"type":"text","value":". That is, as we learn more about the environment, we’d like to be able to exploit that knowledge to take the optimal arm as often as possible.","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"ijIJxogx6H"}],"key":"MJ5u4ydxi7"},{"type":"paragraph","position":{"start":{"line":193,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"The rest of the chapter comprises a series of increasingly sophisticated\nMAB algorithms.","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"KzW4V4ikkX"}],"key":"wYdoVIOC8F"}],"key":"QOI0z1mTqc"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def plot_strategy(mab: MAB, agent: Agent):\n plt.figure(figsize=(10, 6))\n\n # plot reward and cumulative regret\n plt.plot(np.arange(mab.T), np.cumsum(agent.rewards), label=\"reward\")\n cum_regret = np.cumsum(regret_per_step(mab, agent))\n plt.plot(np.arange(mab.T), cum_regret, label=\"cumulative regret\")\n\n # draw colored circles for arm choices\n colors = [\"red\", \"green\", \"blue\"]\n color_array = [colors[k] for k in agent.choices]\n plt.scatter(np.arange(mab.T), np.zeros(mab.T), c=color_array, label=\"arm\")\n\n # labels and title\n plt.xlabel(\"timestep\")\n plt.legend()\n plt.title(f\"{agent.__class__.__name__} reward and regret\")\n plt.show()","visibility":"hide","key":"Oic0T1rJcU"},{"type":"output","id":"AZ41M1n44V_0VeO3N1Hto","data":[],"visibility":"show","key":"NZlsuX2mfT"}],"data":{"tags":[]},"visibility":"show","key":"OvhrYgyK83"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"children":[{"type":"text","value":"Pure exploration (random guessing)","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"XrNgr1WZ4t"}],"identifier":"pure-exploration-random-guessing","label":"Pure exploration (random 
guessing)","html_id":"pure-exploration-random-guessing","implicit":true,"enumerator":"3.2","key":"GY7hYNwhX7"},{"type":"paragraph","position":{"start":{"line":221,"column":1},"end":{"line":222,"column":1}},"children":[{"type":"text","value":"A trivial strategy is to always choose arms at random (i.e. “pure\nexploration”).","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"I93DD8hfYJ"}],"key":"LC8MFtNM3d"}],"key":"Kd6hQsHvbx"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureExploration(Agent):\n def choose_arm(self):\n \"\"\"Choose an arm uniformly at random.\"\"\"\n return solutions.pure_exploration_choose_arm(self)","identifier":"pure_exploration-code","enumerator":"3.1","html_id":"pure-exploration-code","key":"nJi6jzJGDH"},{"type":"output","id":"CYL64WKy-HupL1TMPiNdK","data":[],"identifier":"pure_exploration-output","enumerator":"3.1","html_id":"pure-exploration-output","key":"I6UCgjXRme"}],"data":{},"label":"pure_exploration","identifier":"pure_exploration","enumerator":"3.1","html_id":"pure-exploration","key":"HGM8kCZQy0"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"Note that","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"VJxfJu5wOD"}],"key":"mUqsSCdafE"},{"type":"math","value":"\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^k","position":{"start":{"line":235,"column":1},"end":{"line":237,"column":1}},"html":"EatUnif([K])[μat]=μˉ=1Kk=1Kμk\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^kEatUnif([K])[μat]=μˉ=K1k=1Kμk","enumerator":"3.3","key":"McWlDcTBD3"},{"type":"paragraph","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"so the expected regret is simply","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"DapMwihUQf"}],"key":"o8nKb02Rge"},{"type":"math","value":"\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}","position":{"start":{"line":241,"column":1},"end":{"line":246,"column":1}},"html":"E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.","enumerator":"3.4","key":"CvhMBEldvp"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"This scales as ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"vQRHjcaicx"},{"type":"inlineMath","value":"\\Theta(T)","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"Θ(T)\\Theta(T)Θ(T)","key":"mcCgozONmv"},{"type":"text","value":", i.e. 
","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"A0PkS3KdEO"},{"type":"emphasis","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"zQPAmSzlp8"}],"key":"E145ZElBWQ"},{"type":"text","value":" in the number of timesteps ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"cWYCMgpf9I"},{"type":"inlineMath","value":"T","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"TTT","key":"gs5mvlkSF9"},{"type":"text","value":". There’s no learning here: the agent doesn’t use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears “(uniformly) random”.","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"krJepPQDdz"}],"key":"crMdRwKQ9t"}],"key":"phRU2er2jX"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = PureExploration(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"gjkXdiXE45"},{"type":"output","id":"C3gMzfqn4mzbAemRs1Ex4","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"e018a4b689feff2c40f2483432d7c76f","path":"/build/e018a4b689feff2c40f2483432d7c76f.png"}}}],"key":"AWJsX12lwK"}],"data":{},"key":"K7Or4eKDHy"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Pure greedy","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"dn5Z1rSw8M"}],"identifier":"pure-greedy","label":"Pure greedy","html_id":"pure-greedy","implicit":true,"enumerator":"3.3","key":"RrlI3HHyGj"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"text","value":"How might we improve on pure exploration? Instead, we could try each arm\nonce, and then commit to the one with the highest observed reward. We’ll\ncall this the ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"IzSIjRPWQz"},{"type":"strong","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"pure greedy","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"oQwuaiSKIq"}],"key":"tTwTY9nElw"},{"type":"text","value":" strategy.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"UFaWLe94Zx"}],"key":"pddHWkIaEu"}],"key":"miSJgYJJKe"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureGreedy(Agent):\n def choose_arm(self):\n \"\"\"Choose the arm with the highest observed reward on its first pull.\"\"\"\n return solutions.pure_greedy_choose_arm(self)","identifier":"pure_greedy-code","enumerator":"3.2","html_id":"pure-greedy-code","key":"Mzdo4LbCKW"},{"type":"output","id":"N91WrpQsyAToqgqG-wcaN","data":[],"identifier":"pure_greedy-output","enumerator":"3.2","html_id":"pure-greedy-output","key":"rNBXfWm1nk"}],"data":{},"label":"pure_greedy","identifier":"pure_greedy","enumerator":"3.2","html_id":"pure-greedy","key":"zXmsF7TJYC"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":271,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Note we’ve used superscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"vXNMuNAMkq"},{"type":"inlineMath","value":"r^k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rkr^krk","key":"znog9vbQqa"},{"type":"text","value":" during the exploration phase to\nindicate that we observe exactly one reward for each arm. 
Then we use\nsubscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"tosD5fW4rX"},{"type":"inlineMath","value":"r_t","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rtr_trt","key":"z4j7Us7n7o"},{"type":"text","value":" during the exploitation phase to indicate that we\nobserve a sequence of rewards from the chosen greedy arm ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"BTvgmJN88l"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"k^\\hat kk^","key":"p4PsPKZbDT"},{"type":"text","value":".","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"kMYLzL5trj"}],"key":"owJI3CwPOS"},{"type":"paragraph","position":{"start":{"line":276,"column":1},"end":{"line":279,"column":1}},"children":[{"type":"text","value":"How does the expected regret of this strategy compare to that of pure\nexploration? We’ll do a more general analysis in the following section.\nNow, for intuition, suppose there’s just ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"DlwDmWADm4"},{"type":"inlineMath","value":"K=2","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"K=2K=2K=2","key":"YBiMOpxfhY"},{"type":"text","value":" arms, with Bernoulli\nreward distributions with means ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"YCjpPEjs4e"},{"type":"inlineMath","value":"\\mu^0 > \\mu^1","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"μ0>μ1\\mu^0 > \\mu^1μ0>μ1","key":"TqXBm2U0O8"},{"type":"text","value":".","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"qnonrh1cXC"}],"key":"TjrCvGp95n"},{"type":"paragraph","position":{"start":{"line":281,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Let’s let ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"t4zhwZTk6y"},{"type":"inlineMath","value":"r^0","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0r^0r0","key":"FFvrA83c0j"},{"type":"text","value":" be the random reward from the first arm and ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"wZqYVGKAJM"},{"type":"inlineMath","value":"r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r1r^1r1","key":"gLtpPZZduE"},{"type":"text","value":" be the\nrandom reward from the second. If ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"aEgJKMaM6j"},{"type":"inlineMath","value":"r^0 > r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0>r1r^0 > r^1r0>r1","key":"ahiuNyWLAT"},{"type":"text","value":", then we achieve zero\nregret. Otherwise, we achieve regret ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"AbLTSQ0jMI"},{"type":"inlineMath","value":"T(\\mu^0 - \\mu^1)","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"T(μ0μ1)T(\\mu^0 - \\mu^1)T(μ0μ1)","key":"DnZdA9QuH5"},{"type":"text","value":". 
Thus, the expected regret is simply

$$
\begin{aligned}
    \E[\text{Regret}_T] &= \pr(r^0 < r^1) \cdot T(\mu^0 - \mu^1) + c \\
    &= (1 - \mu^0) \mu^1 \cdot T(\mu^0 - \mu^1) + c,
\end{aligned}
$$

where $c$ collects the constant regret contribution from the
exploration pulls themselves. This is still $\Theta(T)$, the same as
pure exploration!
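To sanity-check this formula numerically, here is a minimal Monte Carlo
sketch of the two-armed case. The means `mu0`, `mu1` and horizon `T`
are illustrative choices, ties are assumed broken in favor of arm 0,
and the constant $c$ is ignored:

```python
import numpy as np

rng = np.random.default_rng(0)
mu0, mu1, T = 0.7, 0.4, 1_000  # illustrative means and horizon

# Closed form: P(r^0 < r^1) = P(r^0 = 0) P(r^1 = 1) = (1 - mu0) * mu1
analytic = (1 - mu0) * mu1 * T * (mu0 - mu1)

# Monte Carlo over many episodes: pull each arm once, then commit
trials = 100_000
r0 = rng.binomial(1, mu0, size=trials)
r1 = rng.binomial(1, mu1, size=trials)
# Ties (r^0 = r^1) are assumed broken in favor of arm 0 here
regret = np.where(r0 < r1, T * (mu0 - mu1), 0.0)
print(regret.mean(), analytic)  # both are Theta(T) for fixed means
```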
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"60449ce2034aedba8d659c77e97c9729","path":"/build/60449ce2034aedba8d659c77e97c9729.png"}}}],"key":"ZyHXGSzbNe"}],"data":{},"key":"yrVuffW2Wv"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"The cumulative regret is a straight line because the regret only depends on the arms chosen and not the actual reward observed. In fact, if the greedy algorithm happens to get lucky on the first set of pulls, it may act entirely optimally for that episode! But its ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"czGUvJi7f4"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"mFnuWVxz8M"}],"key":"ZPa5HNH0i3"},{"type":"text","value":" regret is what measures its effectiveness.","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"kaoFaq7YTV"}],"key":"K0AqEXbylo"}],"key":"I6HoRyHEB8"},{"type":"block","position":{"start":{"line":303,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"children":[{"type":"text","value":"Explore-then-commit","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"VbuceAIyxC"}],"label":"etc","identifier":"etc","html_id":"etc","enumerator":"3.4","key":"xz3t3aG6Lf"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":309,"column":1}},"children":[{"type":"text","value":"We can improve the pure greedy algorithm as follows: let’s reduce the variance of the reward estimates by pulling each arm ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"aUQh0Wkzfe"},{"type":"inlineMath","value":"N_{\\text{explore}}> 1","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"Nexplore>1N_{\\text{explore}}> 1Nexplore>1","key":"kHWyGALp9V"},{"type":"text","value":" times before committing. This is called the ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"musDaQJlpD"},{"type":"strong","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"explore-then-commit","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"olLoghe0iV"}],"key":"GPI7qcirTZ"},{"type":"text","value":" strategy. 
Note that the "pure greedy" strategy above is just the special case
where $N_{\text{explore}} = 1$.

```python
class ExploreThenCommit(Agent):
    def __init__(self, K: int, T: int, N_explore: int):
        super().__init__(K, T)
        self.N_explore = N_explore

    def choose_arm(self):
        return solutions.etc_choose_arm(self)
```

```python
agent = ExploreThenCommit(mab.K, mab.T, mab.T // 15)
mab_loop(mab, agent)
plot_strategy(mab, agent)
```
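The actual selection rule lives in the `solutions` module, which isn't
shown here. As a rough sketch, `etc_choose_arm` might look like the
following, assuming (hypothetically) that the agent tracks per-arm pull
counts `counts` and running sample averages `means`:

```python
import numpy as np

def etc_choose_arm(agent):
    """Sketch of explore-then-commit arm selection.

    Hypothetical bookkeeping: `agent.counts[k]` is the number of pulls
    of arm k so far and `agent.means[k]` is its running sample average;
    the real logic lives in the (unshown) solutions module.
    """
    t = int(agent.counts.sum())
    if t < agent.N_explore * agent.K:
        # Exploration phase: sweep the arms round-robin
        return t % agent.K
    # Exploitation phase: commit to the empirically best arm
    return int(np.argmax(agent.means))
```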
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"dde6263087532775cde0fb2de5a471cc","path":"/build/dde6263087532775cde0fb2de5a471cc.png"}}}],"key":"PvdWlZ37IK"}],"data":{},"key":"MP0FXvoRp3"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"children":[{"type":"text","value":"Notice that now, the graphs are much more consistent, and the algorithm finds the true optimal arm and sticks with it much more frequently. We would expect ETC to then have a better (i.e. lower) average regret. Can we prove this?","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"W8anny8mtg"}],"key":"Q11saI2AWL"}],"key":"M6a1NGEKvJ"},{"type":"block","position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"children":[{"type":"text","value":"ETC regret analysis","position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"key":"CTP0mdUtFr"}],"label":"etc-regret-analysis","identifier":"etc-regret-analysis","html_id":"etc-regret-analysis","enumerator":"3.4.1","key":"FtiDgqDjqw"},{"type":"paragraph","position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Let’s analyze the expected regret of the explore-then-commit strategy by splitting it up\ninto the exploration and exploitation phases.","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"YEaKdfOYq9"}],"key":"MImdorr0Dh"},{"type":"heading","depth":4,"position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"Exploration phase.","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"o4ot3AsMLu"}],"identifier":"exploration-phase","label":"Exploration phase.","html_id":"exploration-phase","implicit":true,"enumerator":"3.4.1.1","key":"pI4CD8dHvd"},{"type":"paragraph","position":{"start":{"line":339,"column":1},"end":{"line":341,"column":1}},"children":[{"type":"text","value":"This phase takes ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"H6yp5fAnfG"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"rgNYuBUDPE"},{"type":"text","value":" timesteps. 
Since at each step we\nincur at most ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"ejZZGJ0vFA"},{"type":"text","value":"1","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"tiOY7E34UP"},{"type":"text","value":" regret, the total regret is at most\n","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"zyacesOYHF"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"Sy0p1Lm27a"},{"type":"text","value":".","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"PO7oxbK17X"}],"key":"vX07R9lVet"},{"type":"heading","depth":4,"position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"Exploitation phase.","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"DZwyeL5S0H"}],"identifier":"exploitation-phase","label":"Exploitation phase.","html_id":"exploitation-phase","implicit":true,"enumerator":"3.4.1.2","key":"Xt5iTkOL3T"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"This will take a bit more effort. We’ll prove that for any total time ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"VLAjWiXhuN"},{"type":"inlineMath","value":"T","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"TTT","key":"np19DN3L7p"},{"type":"text","value":", we can choose ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"AANDMN56k3"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"WLYjZFGcoX"},{"type":"text","value":" such that with arbitrarily high probability, the regret is sublinear.","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"tVf3sfPHx0"}],"key":"eYjUrt90CB"},{"type":"paragraph","position":{"start":{"line":347,"column":1},"end":{"line":348,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"XGM27UACNA"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"html":"k^\\hat kk^","key":"mxLjYt8uY5"},{"type":"text","value":" denote the arm chosen after the exploration phase. 
We know the regret from the exploitation phase is

$$
T_{\text{exploit}} (\mu^\star - \mu^{\hat k}) \qquad \text{where} \qquad T_{\text{exploit}} := T - N_{\text{explore}} K.
$$

So we'd like to bound $\mu^\star - \mu^{\hat k} = o(1)$ (as a function
of $T$) in order to achieve sublinear regret. How can we do this?

Let's define $\Delta^k = \hat\mu^k - \mu^k$ to denote how far the mean
estimate for arm $k$ is from the true mean. How can we bound this
quantity? We'll use the following useful inequality for i.i.d. bounded
random variables:

:::{prf:theorem} Hoeffding's inequality
:label: hoeffding

Let $X_0, \dots, X_{n-1}$ be i.i.d. random variables with
$X_i \in [0, 1]$ almost surely for each $i \in [n]$. Then for any
$\delta > 0$,

$$
\pr\left( \left| \frac{1}{n} \sum_{i=0}^{n-1} (X_i - \E[X_i]) \right| > \sqrt{\frac{\ln(2/\delta)}{2n}} \right) \le \delta.
$$
:::

The proof of this inequality is beyond the scope of this book. See
Vershynin (2018), Chapter 2.2.
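While we won't prove the inequality, we can check it empirically. The
sketch below estimates the failure probability of the bound for
Bernoulli samples; `n`, `delta`, and `mu` are illustrative choices:

```python
import numpy as np

rng = np.random.default_rng(0)
n, delta, mu = 50, 0.05, 0.3  # illustrative values
width = np.sqrt(np.log(2 / delta) / (2 * n))

# Bernoulli(mu) samples lie in [0, 1], so Hoeffding applies
samples = rng.binomial(1, mu, size=(100_000, n))
deviations = np.abs(samples.mean(axis=1) - mu)
print((deviations > width).mean(), "<=", delta)  # empirical failure rate vs. bound
```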
See ","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"tuoE0gGl8h"},{"type":"cite","kind":"narrative","label":"vershynin_high-dimensional_2018","identifier":"vershynin_high-dimensional_2018","children":[{"type":"text","value":"Vershynin (2018)","key":"Z1bsG6WGFe"}],"enumerator":"1","key":"lMXb0pKatl"},{"type":"text","value":" Chapter 2.2.","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"Ou1pwJsDvz"}],"key":"qAyvKVrEGl"},{"type":"paragraph","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"children":[{"type":"text","value":"We can apply this directly to the rewards for a given arm ","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"zBFxE3Fyt7"},{"type":"inlineMath","value":"k","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"html":"kkk","key":"gSoqxAj2au"},{"type":"text","value":", since the rewards from that arm are i.i.d.:","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"dIbV6DN4rb"}],"key":"BsVRhUn8Dw"},{"type":"math","value":"\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.","label":"hoeffding-etc","identifier":"hoeffding-etc","html":"P(Δk>ln(2/δ)2Nexplore)δ.\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.P(Δk>2Nexploreln(2/δ))δ.","enumerator":"3.8","html_id":"hoeffding-etc","key":"I9FH476mJY"},{"type":"paragraph","position":{"start":{"line":380,"column":1},"end":{"line":384,"column":1}},"children":[{"type":"text","value":"But note that we can’t apply this to arm ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"jxI2QjgqGR"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"gebEx97mf3"},{"type":"text","value":" directly since\n","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"rEIGEsfhX0"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"SbiPOv75Rm"},{"type":"text","value":" is itself a random variable. Instead, we need to “uniform-ize”\nthis bound across ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"y5ESBrBkY5"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"all","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"qxtbhQvvsY"}],"key":"uJp3syHPGi"},{"type":"text","value":" the arms, i.e. 
bound the error across all the\narms simultaneously, so that the resulting bound will apply ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"hcAPuUYLj7"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"no matter\nwhat","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"rABlBb9GV1"}],"key":"aO3gVXbBiE"},{"type":"text","value":" ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"vEJehFmUm0"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"jjhDxwQ0kT"},{"type":"text","value":" “crystallizes” to.","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"g96WCulj0Z"}],"key":"P2xIcGPGEL"},{"type":"paragraph","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"mI75Ak1dv4"},{"type":"strong","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"union bound","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"QxzZkXKD84"}],"key":"dkFS096DAr"},{"type":"text","value":" provides a simple way to do this:","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"nXqpmLit8t"}],"key":"pW8IwxwKnE"},{"type":"proof","kind":"theorem","label":"union_bound","identifier":"union_bound","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Union bound","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"FnlpsRYqeG"}],"key":"Od1DYrJS8r"},{"type":"paragraph","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"children":[{"type":"text","value":"Consider a set of events ","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"ay1yGUOxZl"},{"type":"inlineMath","value":"A_0, \\dots, A_{n-1}","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"html":"A0,,An1A_0, \\dots, A_{n-1}A0,,An1","key":"OtNqqAEMIC"},{"type":"text","value":". Then","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"R5G9liaDaT"}],"key":"xq0ARXozI6"},{"type":"math","value":"\\pr(\\exists i \\in [n]. A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"html":"P(i[n].Ai)i=0n1P(Ai).\\pr(\\exists i \\in [n]. 
A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).P(i[n].Ai)i=0n1P(Ai).","enumerator":"3.9","key":"O0UKck4NE2"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":396,"column":1}},"children":[{"type":"text","value":"In\nparticular, if ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"M4jAYfySJz"},{"type":"inlineMath","value":"\\pr(A_i) \\ge 1 - \\delta","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"P(Ai)1δ\\pr(A_i) \\ge 1 - \\deltaP(Ai)1δ","key":"hjQlZmm4sz"},{"type":"text","value":" for each ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"ecjNjdDaGM"},{"type":"inlineMath","value":"i \\in [n]","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"i[n]i \\in [n]i[n]","key":"Z0oriJzbs4"},{"type":"text","value":", we have","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"Bkg2UZHPyx"}],"key":"YvqEdLdCH0"},{"type":"math","value":"\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"P(i[n].Ai)1nδ.\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.P(i[n].Ai)1nδ.","enumerator":"3.10","key":"zyuZSSIEsR"}],"enumerator":"3.2","html_id":"union-bound","key":"JSOc2hr3eE"},{"type":"paragraph","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"strong","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"eiw7QzDLMk"}],"key":"mp776AehnK"},{"type":"text","value":" Prove the second statement above.","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"uLSeGrQV2w"}],"key":"jcFNX6Tu3g"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Applying the union bound across the arms for the l.h.s. 
event of ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Unh1yItP9p"},{"type":"crossReference","kind":"equation","identifier":"hoeffding-etc","label":"hoeffding-etc","children":[{"type":"text","value":"(","key":"VY8hij6os2"},{"type":"text","value":"3.8","key":"pE5PjMpWa2"},{"type":"text","value":")","key":"hBlJB8bJ6z"}],"template":"(%s)","enumerator":"3.8","resolved":true,"html_id":"hoeffding-etc","key":"jwgzfWwjaV"},{"type":"text","value":", we have","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"XOxkp1FCqB"}],"key":"HsVyWYBPId"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}","position":{"start":{"line":405,"column":1},"end":{"line":409,"column":1}},"html":"P(k[K],Δkln(2/δ)2Nexplore)1Kδ\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}P(k[K],Δk2Nexploreln(2/δ))1","enumerator":"3.11","key":"KUEjTCSotj"},{"type":"paragraph","position":{"start":{"line":411,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"text","value":"Then to apply this bound to ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"WzVJumG0lo"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"k^\\hat kk^","key":"mdoabUUmgi"},{"type":"text","value":" in particular, we\ncan apply the useful trick of “adding zero”:","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"gtC6dxl8I1"}],"key":"xe6yRhPMQW"},{"type":"math","value":"\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}","position":{"start":{"line":414,"column":1},"end":{"line":420,"column":1}},"html":"μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+(μ^kμ^k^)0 by definition of k^2ln(2K/δ)2Nexplore with probability at least 1δ\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+0 by definition of k^(μ^kμ^k^)22Nexploreln(2K/δ) with probability at least 1δ","enumerator":"3.12","key":"PMWW4LiDlE"},{"type":"paragraph","position":{"start":{"line":422,"column":1},"end":{"line":423,"column":1}},"children":[{"type":"text","value":"where we’ve set ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"ReFsBmPdCc"},{"type":"inlineMath","value":"\\delta' = K\\delta","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"δ=Kδ\\delta' = K\\deltaδ=","key":"R5kMlTnW97"},{"type":"text","value":". 
Putting this all\ntogether, we’ve shown that, with probability ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"oHQZeqv2VQ"},{"type":"inlineMath","value":"1 - \\delta'","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"1δ1 - \\delta'1δ","key":"OA5FQ4OW8T"},{"type":"text","value":",","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"wAneQSQKvJ"}],"key":"nWpo2bnUPT"},{"type":"math","value":"\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"RegretTNexploreK+Texploit2ln(2K/δ)Nexplore.\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.RegretTNexploreK+TexploitNexplore2ln(2K/δ).","enumerator":"3.13","key":"zHzqH2FhpJ"},{"type":"paragraph","position":{"start":{"line":427,"column":1},"end":{"line":430,"column":1}},"children":[{"type":"text","value":"Note that it suffices for ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"mpDuwFrLnj"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"PgA13OR4tf"},{"type":"text","value":" to be on the order of\n","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"ous3sjwF5t"},{"type":"inlineMath","value":"\\sqrt{T}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"T\\sqrt{T}T","key":"FSQCQ6iwc8"},{"type":"text","value":" to achieve sublinear regret. In particular, we can find the\noptimal ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"juwpDYpB6d"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"Ff1bfnFP39"},{"type":"text","value":" by setting the derivative of the r.h.s. 
to\nzero:","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"L1eZIJVl8t"}],"key":"B9LuJE6L81"},{"type":"math","value":"\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}","position":{"start":{"line":432,"column":1},"end":{"line":437,"column":1}},"html":"0=KTexploit122ln(2K/δ)Nexplore3Nexplore=(Texploitln(2K/δ)/2K)2/3\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}0Nexplore=KTexploit21Nexplore32ln(2K/δ)=(TexploitKln(2K/δ)/2)2/3","enumerator":"3.14","key":"IvZpHaSaTW"},{"type":"paragraph","position":{"start":{"line":439,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"Plugging this into the expression for the regret, we\nhave (still with probability ","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"eIB2mzYWlQ"},{"type":"inlineMath","value":"1-\\delta'","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"html":"1δ1-\\delta'1δ","key":"tOAJWYJZCt"},{"type":"text","value":")","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"Ej8p84QGyz"}],"key":"fruLEgu55I"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}","position":{"start":{"line":442,"column":1},"end":{"line":447,"column":1}},"html":"RegretT3T2/3Kln(2K/δ)/23=O~(T2/3K1/3).\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}RegretT3T2/33Kln(2K/δ)/2=O~(T2/3K1/3).","enumerator":"3.15","key":"NaAEcTHLYk"},{"type":"paragraph","position":{"start":{"line":449,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"text","value":"The ETC algorithm is rather “abrupt” in that it switches from\nexploration to exploitation after a fixed number of timesteps. 
The ETC algorithm is rather "abrupt" in that it switches from
exploration to exploitation after a fixed number of timesteps. In
practice, it's often better to use a more gradual transition, which
brings us to the *epsilon-greedy* algorithm.

## Epsilon-greedy

Instead of doing all of the exploration and then all of the
exploitation separately – which additionally requires knowing the time
horizon beforehand – we can instead interleave exploration and
exploitation by choosing a random action with some probability at each
timestep. We call this the **epsilon-greedy** algorithm.

```python
class EpsilonGreedy(Agent):
    def __init__(
        self,
        K: int,
        T: int,
        ε_array: Float[Array, " T"],
    ):
        super().__init__(K, T)
        self.ε_array = ε_array

    def choose_arm(self):
        return solutions.epsilon_greedy_choose_arm(self)
```

```python
agent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))
mab_loop(mab, agent)
plot_strategy(mab, agent)
```
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"6ad1018e4c18668300eb6bbe80bdc84f","path":"/build/6ad1018e4c18668300eb6bbe80bdc84f.png"}}}],"key":"SzlzGkI4TY"}],"data":{},"key":"v8eGmQOrBl"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that we let ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"dNRGpI9VqU"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"S433pqSxNI"},{"type":"text","value":" vary over time. In particular, we might want to gradually ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"TsqgrxyR76"},{"type":"emphasis","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"decrease","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"HiofvMftbF"}],"key":"kupDfiFWa6"},{"type":"text","value":" ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"xQizjHIK9d"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"x3PHF9rScY"},{"type":"text","value":" as we learn more about the reward distributions and no longer need to spend time exploring.","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"Bq0qBFeGpV"}],"key":"pHCWqbxTZ3"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"IdEnmeBRt7"}],"key":"SMN1xjwoY2"},{"type":"paragraph","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"What is the expected regret of the algorithm if we set ","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"OiGgHPZeTp"},{"type":"text","value":"ε","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"B2RPBIjwFI"},{"type":"text","value":" to be a constant?","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"l9qF4Z4Wfa"}],"key":"nyAbUJnVcT"}],"key":"N4TV9E8wCP"},{"type":"paragraph","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"children":[{"type":"text","value":"It turns out that setting ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"mbyFM7ECKr"},{"type":"inlineMath","value":"\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"ϵt=Kln(t)/t3\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}ϵt=3Kln(t)/t","key":"QIrXv0rlMT"},{"type":"text","value":" also achieves a regret of ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"QBj4i12A05"},{"type":"inlineMath","value":"\\tilde O(t^{2/3} K^{1/3})","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"O~(t2/3K1/3)\\tilde O(t^{2/3} K^{1/3})O~(t2/3K1/3)","key":"ANhoPXb1Dz"},{"type":"text","value":" (ignoring the logarithmic factors). (We will not prove this here.) 
TODO ADD PROOF CITATION","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"nEOgdB07wk"}],"key":"bdMO6HRS95"},{"type":"paragraph","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"In ETC, we had to set ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"FO844C9jT1"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"uU7Ob36I5t"},{"type":"text","value":" based on the total number of timesteps ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"aMUf7k0hU0"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"IkZsE8Y8AJ"},{"type":"text","value":". But the epsilon-greedy algorithm actually handles the exploration ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"j7s5X9Qo9A"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"automatically","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"KzaeiYGX3v"}],"key":"af4PjHMgai"},{"type":"text","value":": the regret rate holds for ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"hCjEAr0zKt"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"Swfp5nXkPw"}],"key":"bM3VjfQxwk"},{"type":"text","value":" ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"epdPmJregn"},{"type":"inlineMath","value":"t","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"ttt","key":"dWEx5auQEU"},{"type":"text","value":", and doesn’t depend on the final horizon ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"ss3iFn6Q2c"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"OD1jGjgbDj"},{"type":"text","value":".","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"Z7FKZPCkXE"}],"key":"omSk5NRzC7"},{"type":"paragraph","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"But the way these algorithms explore is rather naive: we’ve been exploring ","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"lxIJSHuXRt"},{"type":"emphasis","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"uniformly","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"TKt1JgPq1x"}],"key":"yX5xYpPZNs"},{"type":"text","value":" across all the arms. 
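As a sketch, such a decaying schedule could be constructed as follows;
the arm count and horizon are illustrative stand-ins for `mab.K` and
`mab.T`, and the schedule is clipped so that it remains a valid
probability:

```python
import numpy as np

K, T = 10, 1_000  # illustrative; mirrors mab.K, mab.T above
t = np.arange(1, T + 1)
# eps_t = (K ln t / t)^{1/3}, clipped to [0, 1]
eps_array = np.clip((K * np.log(t) / t) ** (1 / 3), 0.0, 1.0)

agent = EpsilonGreedy(K, T, eps_array)  # reuses the class defined above
```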
But the way these algorithms explore is rather naive: we've been
exploring *uniformly* across all the arms. But what if we could be
smarter about it, and explore *more* for arms that we're less certain
about?

## Upper Confidence Bound (UCB)

To quantify how *certain* we are about the mean of each arm, we'll
compute *confidence intervals* for our estimators, and then choose the
arm with the highest *upper confidence bound*. This operates on the
principle of **the benefit of the doubt (i.e. optimism in the face of
uncertainty)**: we'll choose the arm that we're most optimistic about.

In particular, for each arm $k$ at time $t$, we'd like to compute some
upper confidence bound $M^k_t$ such that $\mu^k \le M^k_t$ with high
probability, and then choose $a_t := \arg\max_{k \in [K]} M^k_t$. But
how should we compute $M^k_t$?

In Section 3.4.1, we were able to compute this bound using Hoeffding's
inequality, which assumes that the number of samples is *fixed*. This
was the case in ETC (where we pull each arm $N_{\text{explore}}$
times), but in UCB, the number of times we pull each arm depends on the
agent's actions, which in turn depend on the random rewards and are
therefore stochastic. So we *can't* use Hoeffding's inequality
directly.

Instead, we'll apply the same trick we used in the ETC analysis: we'll
use the **union bound** to compute a *looser* bound that holds
*uniformly* across all timesteps and arms.
Let's introduce some notation to discuss this.

Let $N^k_t$ denote the (random) number of times arm $k$ has been pulled
within the first $t$ timesteps, and $\hat\mu^k_t$ denote the sample
average of those pulls. That is,

$$
\begin{aligned}
    N^k_t &:= \sum_{\tau=0}^{t-1} \mathbf{1} \{ a_\tau = k \} \\
    \hat\mu^k_t &:= \frac{1}{N^k_t} \sum_{\tau=0}^{t-1} \mathbf{1} \{ a_\tau = k \} r_\tau.
\end{aligned}
$$
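In code, these quantities are straightforward to compute from the
interaction history. The toy history below is purely illustrative:

```python
import numpy as np

K = 3
arms = np.array([0, 1, 2, 1, 1, 0])                 # a_tau: illustrative pulls
rewards = np.array([1.0, 0.0, 0.0, 1.0, 1.0, 0.0])  # r_tau

counts = np.bincount(arms, minlength=K)                   # N^k_t
totals = np.bincount(arms, weights=rewards, minlength=K)  # sum of 1{a_tau = k} r_tau
means = np.divide(totals, counts, out=np.zeros(K), where=counts > 0)  # hat mu^k_t
print(counts, means)
```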
To achieve the "fixed sample size" assumption, we'll need to shift our
index from *time* to *number of samples from each arm*. In particular,
we'll define $\tilde r^k_n$ to be the $n$th sample from arm $k$, and
$\tilde\mu^k_n$ to be the sample average of the first $n$ samples from
arm $k$. Then, for a fixed $n$, this satisfies the "fixed sample size"
assumption, and we can apply Hoeffding's inequality to get a bound on
$\tilde\mu^k_n$.

So how can we extend our bound on $\tilde\mu^k_n$ to $\hat\mu^k_t$?
Well, we know $N^k_t \le t$ (where equality would hold if and only if
we had pulled arm $k$ every time). So we can apply the same trick as
last time, where we uniform-ize across all possible values of $N^k_t$:

$$
\begin{aligned}
    \pr\left( \forall n \le t, |\tilde\mu^k_n - \mu^k| \le \sqrt{\frac{\ln(2/\delta)}{2n}} \right) &\ge 1 - t\delta.
\end{aligned}
$$

In particular, since $N^k_t \le t$, and $\tilde\mu^k_{N^k_t} = \hat\mu^k_t$
by definition, we have

$$
\begin{aligned}
    \pr\left( |\hat\mu^k_t - \mu^k| \le \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}} \right) &\ge 1 - \delta' \text{ where } \delta' := t\delta.
\end{aligned}
$$

This bound would then suffice for applying the UCB algorithm!
That is, the upper confidence bound for arm ","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"nbUHch0AbU"},{"type":"inlineMath","value":"k","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"html":"kkk","key":"wdmW0yxyHg"},{"type":"text","value":" would be","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"fdhfOVIImh"}],"key":"gmsGKHn7n3"},{"type":"math","value":"M^k_t := \\hat \\mu^k_t + \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}},","position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"html":"Mtk:=μ^tk+ln(2t/δ)2Ntk,M^k_t := \\hat \\mu^k_t + \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}},Mtk:=μ^tk+2Ntkln(2t/δ),","enumerator":"3.19","key":"H5sWzWuCcX"},{"type":"paragraph","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"children":[{"type":"text","value":"where we can choose ","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"UNCG0p2YOK"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"html":"δ\\delta'δ","key":"LCMB1ehvpb"},{"type":"text","value":" depending on how tight we want the interval to be.","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"uD3CicNit9"}],"key":"uCNEguJbUW"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":571,"column":1},"end":{"line":573,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"children":[{"type":"text","value":"A smaller ","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"key":"UbxakC0fn3"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"html":"δ\\delta'δ","key":"JvltZyJvEK"},{"type":"text","value":" would give us a larger and higher-confidence interval, emphasizing the exploration term.","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"key":"mywyhmbUdE"}],"key":"SGtjQJTYgz"},{"type":"listItem","spread":true,"position":{"start":{"line":572,"column":1},"end":{"line":573,"column":1}},"children":[{"type":"text","value":"A larger ","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"jIBxnvygx6"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"html":"δ\\delta'δ","key":"bvaLlpFhrj"},{"type":"text","value":" would give a tighter and lower-confidence interval, prioritizing the current sample averages.","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"M8cW2ngVzC"}],"key":"qLsAYyLmKH"}],"key":"tyKjdeHtKw"},{"type":"paragraph","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"text","value":"We can now use this to define the UCB algorithm.","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"ll8IPsFMl2"}],"key":"ul9s95Nh8N"}],"key":"ImXc2gMDBs"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class UCB(Agent):\n def __init__(self, K: int, T: int, delta: float):\n super().__init__(K, T)\n self.delta = delta\n\n def choose_arm(self):\n return 
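The arm-selection rule itself is hidden away in `solutions.ucb_choose_arm`. As a rough sketch of what such a rule might compute, directly translating the bound $M^k_t$ above (a standalone function over hypothetical `counts`/`sums` arrays, not the course’s actual `Agent` internals):

```python
import numpy as np

def ucb_choose_arm_sketch(counts: np.ndarray, sums: np.ndarray, t: int, delta: float) -> int:
    """Pick the arm maximizing M^k_t. Illustrative only: `counts[k]` plays
    the role of N^k_t, `sums[k]` is the total reward from arm k, and
    `delta` is the confidence parameter delta'."""
    if np.any(counts == 0):
        return int(np.argmin(counts))  # pull each arm once before trusting the bounds
    means = sums / counts  # sample averages, i.e. hat mu^k_t
    bonus = np.sqrt(np.log(2 * max(t, 1) / delta) / (2 * counts))  # exploration term
    return int(np.argmax(means + bonus))
```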
Intuitively, UCB prioritizes arms where:

1. $\hat \mu^k_t$ is large, i.e. the arm has a high sample average, and
   we’d choose it for *exploitation*, and
2. $\sqrt{\frac{\ln(2t/\delta')}{2N^k_t}}$ is large, i.e. we’re still
   uncertain about the arm, and we’d choose it for *exploration*.

As desired, this explores in a smarter, *adaptive* way compared to the
previous algorithms. Does it achieve lower regret?

```{code-cell} ipython3
agent = UCB(mab.K, mab.T, 0.9)
mab_loop(mab, agent)
plot_strategy(mab, agent)
```
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"f3eb002ad30c5ba869f3a828d502f4d2","path":"/build/f3eb002ad30c5ba869f3a828d502f4d2.png"}}}],"key":"ai0ekXqTWG"}],"data":{},"key":"B3A4iqjFHx"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"children":[{"type":"text","value":"UCB regret analysis","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"hEA0tQhp6n"}],"identifier":"ucb-regret-analysis","label":"UCB regret analysis","html_id":"ucb-regret-analysis","implicit":true,"enumerator":"3.6.1","key":"TcjgXVuRmA"},{"type":"paragraph","position":{"start":{"line":605,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"First we’ll bound the regret incurred at each timestep. Then we’ll bound\nthe ","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"wD9nhV8zVC"},{"type":"emphasis","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"children":[{"type":"text","value":"total","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"ZNJUGKM8un"}],"key":"DhTekA6fgu"},{"type":"text","value":" regret across timesteps.","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"bmO2CQTZii"}],"key":"aAMKrQSUi2"},{"type":"paragraph","position":{"start":{"line":608,"column":1},"end":{"line":611,"column":1}},"children":[{"type":"text","value":"For the sake of analysis, we’ll use a slightly looser bound that applies\nacross the whole time horizon and across all arms. We’ll omit the\nderivation since it’s very similar to the above (walk through it\nyourself for practice).","position":{"start":{"line":608,"column":1},"end":{"line":608,"column":1}},"key":"Euh4om59rY"}],"key":"SiBQJuAEAf"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left(\\forall k \\le K, t < T. |\\hat \\mu^k_t - \\mu^k | \\le B^k_t \\right) &\\ge 1-\\delta'' \\\\\n \\text{where} \\quad B^k_t &:= \\sqrt{\\frac{\\ln(2TK/\\delta'')}{2N^k_t}}.\n\\end{aligned}","position":{"start":{"line":613,"column":1},"end":{"line":618,"column":1}},"html":"P(kK,t<T.μ^tkμkBtk)1δwhereBtk:=ln(2TK/δ)2Ntk.\\begin{aligned}\n \\pr\\left(\\forall k \\le K, t < T. 
|\\hat \\mu^k_t - \\mu^k | \\le B^k_t \\right) &\\ge 1-\\delta'' \\\\\n \\text{where} \\quad B^k_t &:= \\sqrt{\\frac{\\ln(2TK/\\delta'')}{2N^k_t}}.\n\\end{aligned}P(kK,t<T.∣μ^tkμkBtk)whereBtk1δ′′:=2Ntkln(2TK/δ′′).","enumerator":"3.20","key":"LEM2OB2HKR"},{"type":"paragraph","position":{"start":{"line":620,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"Intuitively, ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"YUjQ19nJ6J"},{"type":"inlineMath","value":"B^k_t","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"BtkB^k_tBtk","key":"A9LHSlz7vA"},{"type":"text","value":" denotes the ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"MWg4ValfqS"},{"type":"emphasis","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"children":[{"type":"text","value":"width","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"GszqzutAcc"}],"key":"VTqov60wz5"},{"type":"text","value":" of the CI for arm ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"tXc21Iyp5O"},{"type":"inlineMath","value":"k","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"kkk","key":"hD9aPeM2h2"},{"type":"text","value":" at time\n","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"p1sp91t01k"},{"type":"inlineMath","value":"t","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"ttt","key":"fWzSQEDcME"},{"type":"text","value":". Then, assuming the above uniform bound holds (which occurs with\nprobability ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"sLpDlgMjqn"},{"type":"inlineMath","value":"1-\\delta''","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"1δ1-\\delta''1δ′′","key":"v4HbV0xFoE"},{"type":"text","value":"), we can bound the regret at each timestep as\nfollows:","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"yyQWWO9DO5"}],"key":"dmZGlQJmMx"},{"type":"math","value":"\\begin{aligned}\n \\mu^\\star - \\mu^{a_t} &\\le \\hat \\mu^{k^*}_t + B_t^{k^*} - \\mu^{a_t} && \\text{applying UCB to arm } k^\\star \\\\\n &\\le \\hat \\mu^{a_t}_t + B^{a_t}_t - \\mu^{a_t} && \\text{since UCB chooses } a_t = \\arg \\max_{k \\in [K]} \\hat \\mu^k_t + B_t^{k} \\\\\n &\\le 2 B^{a_t}_t && \\text{since } \\hat \\mu^{a_t}_t - \\mu^{a_t} \\le B^{a_t}_t \\text{ by definition of } B^{a_t}_t \\\\\n\\end{aligned}","position":{"start":{"line":625,"column":1},"end":{"line":631,"column":1}},"html":"μμatμ^tk+Btkμatapplying UCB to arm kμ^tat+Btatμatsince UCB chooses at=argmaxk[K]μ^tk+Btk2Btatsince μ^tatμatBtat by definition of Btat\\begin{aligned}\n \\mu^\\star - \\mu^{a_t} &\\le \\hat \\mu^{k^*}_t + B_t^{k^*} - \\mu^{a_t} && \\text{applying UCB to arm } k^\\star \\\\\n &\\le \\hat \\mu^{a_t}_t + B^{a_t}_t - \\mu^{a_t} && \\text{since UCB chooses } a_t = \\arg \\max_{k \\in [K]} \\hat \\mu^k_t + B_t^{k} \\\\\n &\\le 2 B^{a_t}_t && \\text{since } \\hat \\mu^{a_t}_t - \\mu^{a_t} \\le B^{a_t}_t \\text{ by definition of } B^{a_t}_t \\\\\n\\end{aligned}μμatμ^tk+Btkμatμ^tat+Btatμat2Btatapplying UCB to arm ksince UCB chooses at=argk[K]maxμ^tk+Btksince μ^tatμatBtat by definition of 
Btat","enumerator":"3.21","key":"ygEGQfzJln"},{"type":"paragraph","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"Summing this across timesteps gives","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"TpSotBH2o5"}],"key":"ImjcxFfPmb"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le \\sum_{t=0}^{T-1} 2 B^{a_t}_t \\\\\n &= \\sqrt{2\\ln(2TK/\\delta'')} \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\\\\n \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \\sum_{t=0}^{T-1} \\sum_{k=1}^K \\mathbf{1}\\{ a_t = k \\} (N^k_t)^{-1/2} \\\\\n &= \\sum_{k=1}^K \\sum_{n=1}^{N_T^k} n^{-1/2} \\\\\n &\\le K \\sum_{n=1}^T n^{-1/2} \\\\\n \\sum_{n=1}^T n^{-1/2} &\\le 1 + \\int_1^T x^{-1/2} \\ \\mathrm{d}x \\\\\n &= 1 + (2 \\sqrt{x})_1^T \\\\\n &= 2 \\sqrt{T} - 1 \\\\\n &\\le 2 \\sqrt{T} \\\\\n\\end{aligned}","position":{"start":{"line":635,"column":1},"end":{"line":647,"column":1}},"html":"RegretTt=0T12Btat=2ln(2TK/δ)t=0T1(Ntat)1/2t=0T1(Ntat)1/2=t=0T1k=1K1{at=k}(Ntk)1/2=k=1Kn=1NTkn1/2Kn=1Tn1/2n=1Tn1/21+1Tx1/2 dx=1+(2x)1T=2T12T\\begin{aligned}\n \\text{Regret}_T &\\le \\sum_{t=0}^{T-1} 2 B^{a_t}_t \\\\\n &= \\sqrt{2\\ln(2TK/\\delta'')} \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\\\\n \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \\sum_{t=0}^{T-1} \\sum_{k=1}^K \\mathbf{1}\\{ a_t = k \\} (N^k_t)^{-1/2} \\\\\n &= \\sum_{k=1}^K \\sum_{n=1}^{N_T^k} n^{-1/2} \\\\\n &\\le K \\sum_{n=1}^T n^{-1/2} \\\\\n \\sum_{n=1}^T n^{-1/2} &\\le 1 + \\int_1^T x^{-1/2} \\ \\mathrm{d}x \\\\\n &= 1 + (2 \\sqrt{x})_1^T \\\\\n &= 2 \\sqrt{T} - 1 \\\\\n &\\le 2 \\sqrt{T} \\\\\n\\end{aligned}RegretTt=0T1(Ntat)1/2n=1Tn1/2t=0T12Btat=2ln(2TK/δ′′)t=0T1(Ntat)1/2=t=0T1k=1K1{at=k}(Ntk)1/2=k=1Kn=1NTkn1/2Kn=1Tn1/21+1Tx1/2 dx=1+(2x)1T=2T12T","enumerator":"3.22","key":"UIiiqDQHYG"},{"type":"paragraph","position":{"start":{"line":649,"column":1},"end":{"line":649,"column":1}},"children":[{"type":"text","value":"Putting everything together gives","position":{"start":{"line":649,"column":1},"end":{"line":649,"column":1}},"key":"Nn0KdjNbd3"}],"key":"WmFYc6AeUD"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le 2 K \\sqrt{2T \\ln(2TK/\\delta'')} && \\text{with probability } 1-\\delta'' \\\\\n &= \\tilde O(K\\sqrt{T})\n\\end{aligned}","position":{"start":{"line":651,"column":1},"end":{"line":656,"column":1}},"html":"RegretT2K2Tln(2TK/δ)with probability 1δ=O~(KT)\\begin{aligned}\n \\text{Regret}_T &\\le 2 K \\sqrt{2T \\ln(2TK/\\delta'')} && \\text{with probability } 1-\\delta'' \\\\\n &= \\tilde O(K\\sqrt{T})\n\\end{aligned}RegretT2K2Tln(2TK/δ′′)=O~(KT)with probability 1δ′′","enumerator":"3.23","key":"pXkTZyhUq4"},{"type":"paragraph","position":{"start":{"line":658,"column":1},"end":{"line":659,"column":1}},"children":[{"type":"text","value":"In fact, we can do a more sophisticated analysis to trim off a factor of ","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"sVNuGJYyFe"},{"type":"inlineMath","value":"\\sqrt{K}","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"html":"K\\sqrt{K}K","key":"H3qGP78wCk"},{"type":"text","value":"\nand show ","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"NLkqbUdW0V"},{"type":"inlineMath","value":"\\text{Regret}_T = \\tilde O(\\sqrt{TK})","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"html":"RegretT=O~(TK)\\text{Regret}_T = \\tilde 
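The elementary inequality $\sum_{n=1}^T n^{-1/2} \le 2\sqrt{T}$ used above is easy to sanity-check numerically (a standalone snippet, not part of the chapter’s code):

```python
import numpy as np

T = 10_000
lhs = np.sum(1 / np.sqrt(np.arange(1, T + 1)))  # sum of n^{-1/2} for n = 1..T
rhs = 2 * np.sqrt(T)                            # the integral upper bound
print(f"{lhs:.1f} <= {rhs:.1f}")                # prints roughly 198.5 <= 200.0
```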
### Lower bound on regret (intuition)

Is it possible to do better than $\Omega(\sqrt{T})$ in general? In fact,
no! We can show that any algorithm must incur $\Omega(\sqrt{T})$ regret
in the worst case. We won’t rigorously prove this here, but the
intuition is as follows.

The Central Limit Theorem tells us that with $T$ i.i.d. samples from
some distribution, we can only learn the mean of the distribution to
within $\Omega(1/\sqrt{T})$ (the standard deviation). Then, since we get
$T$ samples spread out across the arms, we can only learn each arm’s
mean to an even looser degree.

That is, if two arms have means that are within about $1/\sqrt{T}$, we
won’t be able to confidently tell them apart, and will sample them about
equally. But then we’ll incur regret

$$
\Omega((T/2) \cdot (1/\sqrt{T})) = \Omega(\sqrt{T}).
$$

(thompson_sampling)=
## Thompson sampling and Bayesian bandits

So far, we’ve treated the parameters $\mu^0, \dots, \mu^{K-1}$ of the
reward distributions as *fixed*. Instead, we can take a **Bayesian**
approach where we treat them as random variables from some **prior
distribution**. Then, upon pulling an arm and observing a reward, we can
simply *condition* on this observation to exactly describe the
**posterior distribution** over the parameters. This fully describes the
information we gain about the parameters from observing the reward.
From this Bayesian perspective, the **Thompson sampling** algorithm
follows naturally: just sample from the distribution of the optimal arm,
given the observations!

```{code-cell} ipython3
class Distribution:
    def sample(self) -> Float[Array, " K"]:
        """Sample a vector of means for the K arms."""
        ...

    def update(self, arm: int, reward: float):
        """Condition on obtaining `reward` from the given arm."""
        ...
```

```{code-cell} ipython3
class ThompsonSampling(Agent):
    def __init__(self, K: int, T: int, prior: Distribution):
        super().__init__(K, T)
        self.distribution = prior

    def choose_arm(self):
        means = self.distribution.sample()
        return random_argmax(means)

    def update_history(self, arm: int, reward: int):
        super().update_history(arm, reward)
        self.distribution.update(arm, reward)
```
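The `random_argmax` helper isn’t shown in this excerpt; a plausible version (assuming its job is to break ties uniformly at random, which matters when several arms share the same sampled mean) would be:

```python
import numpy as np

def random_argmax(values: np.ndarray) -> int:
    """Index of the maximum entry, breaking ties uniformly at random (sketch)."""
    best = np.flatnonzero(values == values.max())
    return int(np.random.choice(best))
```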
In other words, we sample each arm proportionally to how likely we think
it is to be optimal, given the observations so far. This strikes a good
exploration-exploitation tradeoff: we explore more for arms that we’re
less certain about, and exploit more for arms that we’re more certain
about. Thompson sampling is a simple yet powerful algorithm that
achieves state-of-the-art performance in many settings.

```{prf:example} Bayesian Bernoulli bandit
:label: bayesian_bernoulli

We’ve been working in the Bernoulli bandit setting, where arm $k$ yields a reward of 1 with probability $\mu^k$ and no reward otherwise. The vector of success probabilities $\boldsymbol{\mu} = (\mu^1, \dots, \mu^K)$ thus describes the entire MAB.

Under the Bayesian perspective, we think of $\boldsymbol{\mu}$ as a *random* vector drawn from some prior distribution $\pi(\boldsymbol{\mu})$. For example, we might have $\pi$ be the Uniform distribution over the unit hypercube $[0, 1]^K$, that is,

$$
\pi(\boldsymbol{\mu}) = \begin{cases}
    1 & \text{if } \boldsymbol{\mu} \in [0, 1]^K \\
    0 & \text{otherwise}
\end{cases}
$$

In this case, upon viewing some reward, we can exactly calculate the **posterior** distribution of $\boldsymbol{\mu}$ using Bayes’s rule (i.e. the definition of conditional probability):

$$
\begin{aligned}
    \pr(\boldsymbol{\mu} \mid a_0, r_0) &\propto \pr(r_0 \mid a_0, \boldsymbol{\mu}) \pr(a_0 \mid \boldsymbol{\mu}) \pr(\boldsymbol{\mu}) \\
    &\propto (\mu^{a_0})^{r_0} (1 - \mu^{a_0})^{1-r_0}.
\end{aligned}
$$

This is the PDF of the $\text{Beta}(1 + r_0, 1 + (1 - r_0))$ distribution, which is a conjugate prior for the Bernoulli distribution. That is, if we start with a Beta prior on $\mu^k$ (note that $\text{Unif}([0, 1]) = \text{Beta}(1, 1)$), then the posterior, after conditioning on samples from $\text{Bern}(\mu^k)$, will also be Beta. This is a very convenient property, since it means we can simply update the parameters of the Beta distribution upon observing a reward, rather than having to recompute the entire posterior distribution from scratch.
```

```{code-cell} ipython3
class Beta(Distribution):
    def __init__(self, K: int, alpha: int = 1, beta: int = 1):
        self.alphas = np.full(K, alpha)
        self.betas = np.full(K, beta)

    def sample(self):
        return np.random.beta(self.alphas, self.betas)

    def update(self, arm: int, reward: int):
        self.alphas[arm] += reward
        self.betas[arm] += 1 - reward
```

```{code-cell} ipython3
beta_distribution = Beta(mab.K)
agent = ThompsonSampling(mab.K, mab.T, beta_distribution)
mab_loop(mab, agent)
plot_strategy(mab, agent)
```
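To see the conjugate update in isolation, here is a tiny standalone simulation (the true mean 0.7 is made up for illustration): the posterior mean $\alpha / (\alpha + \beta)$ should concentrate around it.

```python
import numpy as np

rng = np.random.default_rng(0)
mu = 0.7                 # hypothetical true Bernoulli mean
alpha, beta = 1, 1       # Beta(1, 1), i.e. a uniform prior
for _ in range(500):
    reward = int(rng.random() < mu)  # draw a Bernoulli(mu) reward
    alpha += reward                  # conjugate update: count successes...
    beta += 1 - reward               # ...and failures
print(f"posterior mean: {alpha / (alpha + beta):.2f}")  # close to 0.7
```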
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"991419959ab213822fb1c34db8883adb","path":"/build/991419959ab213822fb1c34db8883adb.png"}}}],"key":"EQ5TWZ78QF"}],"data":{},"key":"UKhfpu9hge"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"It turns out that asymptotically, Thompson sampling is optimal in the\nfollowing sense. ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"kKgvkTkEr7"},{"type":"cite","kind":"narrative","label":"lai_asymptotically_1985","identifier":"lai_asymptotically_1985","children":[{"type":"text","value":"Lai & Robbins (1985)","key":"C56GFz09Lh"}],"enumerator":"2","key":"DoRm8bvRIx"},{"type":"text","value":" prove an\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"PksSLFAB6G"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"instance-dependent","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"OOTyxDBv4i"}],"key":"lV61cK6cQk"},{"type":"text","value":" lower bound that says for ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"MZyQ0iSp3c"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"B368Bg8xin"}],"key":"GO22Wyujoa"},{"type":"text","value":" bandit algorithm,","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"tAiVSNvc4p"}],"key":"UVBafBWhBU"},{"type":"math","value":"\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"lim infTE[NTk]ln(T)1KL(μkμ)\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}Tliminfln(T)E[NTk]KL(μkμ)1","enumerator":"3.27","key":"bdxYuZlzFX"},{"type":"paragraph","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"key":"zrfzusB9wn"}],"key":"PKlnInBiSw"},{"type":"math","value":"\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}","position":{"start":{"line":792,"column":1},"end":{"line":792,"column":1}},"html":"KL(μkμ)=μklnμkμ+(1μk)ln1μk1μ\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}KL(μkμ)=μklnμμk+(1μk)ln1μ1μk","enumerator":"3.28","key":"TC3VWZksA4"},{"type":"paragraph","position":{"start":{"line":794,"column":1},"end":{"line":798,"column":1}},"children":[{"type":"text","value":"measures the ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"DJ93yuts5m"},{"type":"strong","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"Kullback-Leibler divergence","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"jVJJKizQIE"}],"key":"M51kKN4ETX"},{"type":"text","value":" from the Bernoulli\ndistribution with mean 
","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"jQ2M1LbITA"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μk\\mu^kμk","key":"pwEzX2Rn9s"},{"type":"text","value":" to the Bernoulli distribution with mean\n","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"SWZyIDhs3b"},{"type":"inlineMath","value":"\\mu^\\star","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μ\\mu^\\starμ","key":"xbU7W1NHgH"},{"type":"text","value":". It turns out that Thompson sampling achieves this lower\nbound with equality! That is, not only is the error ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"YeyQ8dzgnz"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"rate","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"rwJH9i2g1T"}],"key":"BKjbWoNUaE"},{"type":"text","value":" optimal, but\nthe ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"ttmIJ6pdxP"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"constant factor","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"dt4G7FDCKz"}],"key":"ooVYYmDWJg"},{"type":"text","value":" is optimal as well.","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"grNbHs5F6e"}],"key":"xmc8fDx2g5"}],"key":"mbjffYIa2s"},{"type":"block","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"Contextual bandits","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"bJvQ856FD2"}],"identifier":"contextual-bandits","label":"Contextual bandits","html_id":"contextual-bandits","implicit":true,"enumerator":"3.8","key":"oncyOfnMyO"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"pK3pNhPeys"}],"key":"DnwMvGfnUx"},{"type":"paragraph","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"children":[{"type":"text","value":"This content is advanced material taught at the end of the course.","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"key":"oi8IVrqslu"}],"key":"RrSNDdURAl"}],"key":"NwWmAgFY9v"},{"type":"paragraph","position":{"start":{"line":808,"column":1},"end":{"line":814,"column":1}},"children":[{"type":"text","value":"In the above MAB environment, the reward distributions of the arms\nremain constant. However, in many real-world settings, we might receive\nadditional information that affects these distributions. For example, in\nthe online advertising case where each arm corresponds to an ad we could\nshow the user, we might receive information about the user’s preferences\nthat changes how likely they are to click on a given ad. 
## Contextual bandits

```{note}
This content is advanced material taught at the end of the course.
```

In the above MAB environment, the reward distributions of the arms
remain constant. However, in many real-world settings, we might receive
additional information that affects these distributions. For example, in
the online advertising case where each arm corresponds to an ad we could
show the user, we might receive information about the user’s preferences
that changes how likely they are to click on a given ad. We can model
such environments using **contextual bandits**.

```{prf:definition} Contextual bandit
:label: contextual_bandit

At each timestep $t$, a new *context*
$x_t$ is drawn from some distribution $\nu_{\text{x}}$. The learner gets
to observe the context, and choose an action $a_t$ according to some
context-dependent policy $\pi_t(x_t)$. Then, the learner observes the
reward from the chosen arm $r_t \sim \nu^{a_t}(x_t)$. The reward
distribution also depends on the context.
```

Assuming our context is *discrete*, we can just perform the same
algorithms, treating each context-arm pair as its own arm. This gives us
an enlarged MAB of $K |\mathcal{X}|$ arms.

```{attention}
Write down the UCB algorithm for this enlarged MAB. That is, write an
expression for $\pi_t(x_t) = \arg\max_a \dots$.
```

Recall that running UCB for $T$ timesteps on an MAB with $K$ arms
achieves a regret bound of $\tilde{O}(\sqrt{TK})$. So in this problem,
we would achieve regret $\tilde{O}(\sqrt{TK|\mathcal{X}|})$ in the
contextual MAB, which has a polynomial dependence on $|\mathcal{X}|$.
But in a situation where we have large, or even infinitely many
contexts, e.g. in the case where our context is a continuous value, this
becomes intractable.

Note that this “enlarged MAB” treats the different contexts as entirely
unrelated to each other, while in practice, often contexts are *related*
to each other in some way: for example, we might want to advertise
similar products to users with similar preferences. How can we
incorporate this structure into our solution?
","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"te3zIUwTeR"},{"type":"inlineMath","value":"\\mu^k(x)","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"μk(x)\\mu^k(x)μk(x)","key":"eblzazw6BQ"},{"type":"text","value":". One simple model is the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"FtgN4ZZbHv"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"vof5zLmKyB"}],"key":"Hny8Q3Xktp"},{"type":"text","value":" one:\n","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"hXkCrb54n6"},{"type":"inlineMath","value":"\\mu^k(x) = x^\\top \\theta^k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"μk(x)=xθk\\mu^k(x) = x^\\top \\theta^kμk(x)=xθk","key":"FzZm1UX63m"},{"type":"text","value":", where ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"bvrXtg0P9c"},{"type":"inlineMath","value":"x \\in \\mathcal{X} = \\mathbb{R}^d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"xX=Rdx \\in \\mathcal{X} = \\mathbb{R}^dxX=Rd","key":"o3R1O8CBw6"},{"type":"text","value":" and\n","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"Aa6YmjH28Z"},{"type":"inlineMath","value":"\\theta^k \\in \\mathbb{R}^d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"θkRd\\theta^k \\in \\mathbb{R}^dθkRd","key":"EJN2He2vra"},{"type":"text","value":" describes a ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"jBOlBsUoqX"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"feature direction","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"dfyBNXD7hw"}],"key":"USJYnILtRm"},{"type":"text","value":" for arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"LtVNUfm9gt"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"G0BSxJJTKL"},{"type":"text","value":". 
Recall\nthat ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"eHaVhCu7fp"},{"type":"strong","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"nldWF86bax"}],"key":"Yv0lxAmaod"},{"type":"text","value":" gives us a way to estimate a conditional\nexpectation from samples: We learn a ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"iGj0xfFqmV"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"least squares","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"cmGe8NwXHs"}],"key":"lbBORs06O3"},{"type":"text","value":" estimator from the\ntimesteps where arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"iDcg64Z4tL"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"vbsfTaO9F5"},{"type":"text","value":" was selected:","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"bssOknpZDQ"}],"key":"aW2QfidbNP"},{"type":"math","value":"\\hat \\theta_t^k = \\arg\\min_{\\theta \\in \\mathbb{R}^d} \\sum_{\\{ i \\in [t] : a_i = k \\}} (r_i - x_i^\\top \\theta)^2.","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"tight":true,"html":"θ^tk=argminθRd{i[t]:ai=k}(rixiθ)2.\\hat \\theta_t^k = \\arg\\min_{\\theta \\in \\mathbb{R}^d} \\sum_{\\{ i \\in [t] : a_i = k \\}} (r_i - x_i^\\top \\theta)^2.θ^tk=argθRdmin{i[t]:ai=k}(rixiθ)2.","enumerator":"3.29","key":"GS6VqKQmvI"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":866,"column":1}},"children":[{"type":"text","value":"This has the closed-form solution known as the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"M7IlKppcM5"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"ordinary least squares","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"ZrVqfLRksa"}],"key":"ngxBDV7Z3K"},{"type":"text","value":"\n(OLS) estimator:","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"hguwVH8vcV"}],"key":"SZE7efMS6r"},{"type":"math","value":"\\begin{aligned}\n \\hat \\theta_t^k & = (A_t^k)^{-1} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i r_i \\\\\n \\text{where} \\quad A_t^k & = \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top.\n\\end{aligned}","label":"ols_bandit","identifier":"ols_bandit","html":"θ^tk=(Atk)1{i[t]:ai=k}xiriwhereAtk={i[t]:ai=k}xixi.\\begin{aligned}\n \\hat \\theta_t^k & = (A_t^k)^{-1} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i r_i \\\\\n \\text{where} \\quad A_t^k & = \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top.\n\\end{aligned}θ^tkwhereAtk=(Atk)1{i[t]:ai=k}xiri={i[t]:ai=k}xixi.","enumerator":"3.30","html_id":"ols-bandit","key":"J39Nyb1tvy"},{"type":"paragraph","position":{"start":{"line":877,"column":1},"end":{"line":884,"column":1}},"children":[{"type":"text","value":"We can now apply the UCB algorithm in this environment in order to\nbalance 
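In code, the per-arm OLS estimate is a few lines (a sketch with illustrative array names, not the course’s API):

```python
import numpy as np

def ols_arm_estimate(xs: np.ndarray, rs: np.ndarray):
    """OLS estimate for one arm, cf. the closed form above.

    `xs` has shape (n, d): the contexts at the timesteps where this arm
    was pulled; `rs` has shape (n,): the corresponding rewards.
    """
    A = xs.T @ xs                              # A^k_t = sum_i x_i x_i^T
    theta_hat = np.linalg.solve(A, xs.T @ rs)  # (A^k_t)^{-1} sum_i x_i r_i
    return theta_hat, A
```

This assumes $A_t^k$ is invertible, e.g. after observing enough diverse contexts; in practice one often adds a small ridge term $\lambda I$ to guarantee this.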
","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"VTIRHmzLiI"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploration","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"KbbIewPmhr"}],"key":"ap5EoHy7BI"},{"type":"text","value":" of new arms and ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"iM63qF9INE"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploitation","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"zjizvtdgBi"}],"key":"on3ewRfpum"},{"type":"text","value":" of arms that we\nbelieve to have high reward. But how should we construct the upper\nconfidence bound? Previously, we treated the pulls of an arm as i.i.d.\nsamples and used Hoeffding’s inequality to bound the distance of the\nsample mean, our estimator, from the true mean. However, now our\nestimator is not a sample mean, but rather the OLS estimator above ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"x7p1kgj8s8"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"vnrNxtOWYb"},{"type":"text","value":"3.30","key":"xAM990WjWe"},{"type":"text","value":")","key":"KoqpbIiQmk"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"gvuf6Xhmvd"},{"type":"text","value":". Instead, we’ll use ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"W2XKlg69XE"},{"type":"strong","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"Chebyshev’s\ninequality","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"Z49fFv4kZT"}],"key":"Jcxep6k4LN"},{"type":"text","value":" to construct an upper confidence bound.","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"I6O8lLR4gj"}],"key":"to4gBqgSxo"},{"type":"proof","kind":"theorem","label":"chebyshev","identifier":"chebyshev","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Chebyshev’s inequality","position":{"start":{"line":886,"column":1},"end":{"line":886,"column":1}},"key":"OMUnr6JGH4"}],"key":"JEzLkkhSdI"},{"type":"paragraph","position":{"start":{"line":889,"column":1},"end":{"line":891,"column":1}},"children":[{"type":"text","value":"For a random variable ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"AZis6nVr9I"},{"type":"inlineMath","value":"Y","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"YYY","key":"qWLDoMMCG3"},{"type":"text","value":" such that\n","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"ksSKlqlAqO"},{"type":"inlineMath","value":"\\E Y = 0","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY=0\\E Y = 0EY=0","key":"EvLR7ekDtV"},{"type":"text","value":" and ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"v1x7GwhZw6"},{"type":"inlineMath","value":"\\E Y^2 = \\sigma^2","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY2=σ2\\E Y^2 = 
\\sigma^2EY2=σ2","key":"su7I4VS1Ts"},{"type":"text","value":",","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"sZYkSlool0"}],"key":"n6wHeo4rvu"},{"type":"math","value":"|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"tight":"before","html":"Yβσwith probability11β2|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}Yβσwith probability1β21","enumerator":"3.31","key":"NoqOWSV9HZ"}],"enumerator":"3.3","html_id":"chebyshev","key":"FybXKXCSbO"},{"type":"paragraph","position":{"start":{"line":894,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"Since the OLS estimator is known to be unbiased (try proving this\nyourself), we can apply Chebyshev’s inequality to\n","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"WfTl7KYJhq"},{"type":"inlineMath","value":"x_t^\\top (\\hat \\theta_t^k - \\theta^k)","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"html":"xt(θ^tkθk)x_t^\\top (\\hat \\theta_t^k - \\theta^k)xt(θ^tkθk)","key":"NfEfcssPxJ"},{"type":"text","value":":","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"JovScqgU9k"}],"key":"gpM7XmpIJC"},{"type":"math","value":"\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":900,"column":1}},"html":"xtθkxtθ^tk+βxt(Atk)1xtwith probability11β2\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}xtθkxtθ^tk+βxt(Atk)1xtwith probability1β21","enumerator":"3.32","key":"rCqD4kFogw"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"PONDjfwg0K"}],"key":"Tz3Z01EX59"},{"type":"paragraph","position":{"start":{"line":903,"column":1},"end":{"line":905,"column":1}},"children":[{"type":"text","value":"We haven’t explained why ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"C8VoAySsMU"},{"type":"inlineMath","value":"x_t^\\top (A_t^k)^{-1} x_t","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xt(Atk)1xtx_t^\\top (A_t^k)^{-1} x_txt(Atk)1xt","key":"tmEypczjkW"},{"type":"text","value":" is the correct\nexpression for the variance of ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"R142YgYHgH"},{"type":"inlineMath","value":"x_t^\\top \\hat \\theta_t^k","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xtθ^tkx_t^\\top \\hat \\theta_t^kxtθ^tk","key":"Kc4UTca9l8"},{"type":"text","value":". 
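Though we defer the derivation, we can at least sanity-check the claim by simulation on synthetic data (the dimension, sample size, and noise level below are arbitrary choices): for a fixed design and independent reward noise of variance $\sigma^2$, the sampling variance of the prediction $x^\top \hat \theta$ should match $\sigma^2 \cdot x^\top A^{-1} x$.

```python
import numpy as np

rng = np.random.default_rng(0)
d, n, sigma = 3, 50, 0.5
X = rng.normal(size=(n, d))     # fixed contexts where this arm was pulled
theta = rng.normal(size=d)      # true parameter
A = X.T @ X                     # A = sum_i x_i x_i^T
x = rng.normal(size=d)          # query context

# resample the rewards many times and track the prediction x^T theta_hat
preds = [
    x @ np.linalg.solve(A, X.T @ (X @ theta + sigma * rng.normal(size=n)))
    for _ in range(5000)
]
print(np.var(preds))                         # empirical variance of the prediction
print(sigma**2 * x @ np.linalg.solve(A, x))  # sigma^2 * x^T A^{-1} x
```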
This result\nfollows from some algebra on the definition of the OLS estimator ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"j05uYewFF8"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"SUHI474dXi"},{"type":"text","value":"3.30","key":"AcXfCRwHw8"},{"type":"text","value":")","key":"vQsWBvVb0Y"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"snP7RHVMjc"},{"type":"text","value":".","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"oVdv1N8Caq"}],"key":"n6LKTCnKqc"}],"key":"cb7L2OlJkd"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"The first term is exactly our predicted reward ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"cobAoNoj4m"},{"type":"inlineMath","value":"\\hat \\mu^k_t(x_t)","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"μ^tk(xt)\\hat \\mu^k_t(x_t)μ^tk(xt)","key":"eJC8i9UiQN"},{"type":"text","value":". To\ninterpret the second term, note that","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"Op6GUlx7CR"}],"key":"kZmfXSgAJq"},{"type":"math","value":"x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"xt(Atk)1xt=1Ntkxt(Σtk)1xt,x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,xt(Atk)1xt=Ntk1xt(Σtk)1xt,","enumerator":"3.33","key":"bTEOEfvEir"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"VRNWkIovPe"}],"key":"HvlWxaIh4X"},{"type":"math","value":"\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"Σtk=1Ntk{i[t]:ai=k}xixi\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\topΣtk=Ntk1{i[t]:ai=k}xixi","enumerator":"3.34","key":"itZGTBF0pe"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"is the empirical covariance matrix of the contexts (assuming that the\ncontext has mean zero). 
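As a tiny synthetic illustration of what this term measures (all numbers made up): if every context observed so far points along the first coordinate axis, the width $\sqrt{x^\top A^{-1} x}$ is small along that axis and very large in the unexplored direction.

```python
import numpy as np

X = np.tile(np.array([1.0, 0.0]), (10, 1))  # ten pulls, all with context e_1
A = X.T @ X + 1e-6 * np.eye(2)              # tiny jitter so A is invertible
for x in (np.array([1.0, 0.0]), np.array([0.0, 1.0])):
    width = np.sqrt(x @ np.linalg.solve(A, x))
    print(x, width)  # small width along e_1, very large width along e_2
```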
That is, the learner is encouraged to choose\narms when ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"gWHfEcPbyh"},{"type":"inlineMath","value":"x_t","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"xtx_txt","key":"KUA5rCjzRo"},{"type":"text","value":" is ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"E8cDMU3Z1Z"},{"type":"emphasis","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"children":[{"type":"text","value":"not aligned","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"A6mul87Acd"}],"key":"Y9xkKBVmZf"},{"type":"text","value":" with the data seen so far, or if arm\n","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"FyWV8c6mLP"},{"type":"inlineMath","value":"k","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"kkk","key":"J2OUycvBsM"},{"type":"text","value":" has not been explored much and so ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"rjkB1Raknn"},{"type":"inlineMath","value":"N_t^k","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"NtkN_t^kNtk","key":"QEFoAb1Um1"},{"type":"text","value":" is small.","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"NKiniqhBej"}],"key":"dFsvuiJ0Fe"},{"type":"paragraph","position":{"start":{"line":918,"column":1},"end":{"line":919,"column":1}},"children":[{"type":"text","value":"We can now substitute these quantities into UCB to get the ","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"sm6zOdcZrJ"},{"type":"strong","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"children":[{"type":"text","value":"LinUCB","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"WVJXYIzQva"}],"key":"bU5cPWfVxg"},{"type":"text","value":"\nalgorithm:","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"YgkUPjX44K"}],"key":"QsgaRYrMI1"}],"key":"LQMMVluGww"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class LinUCBPseudocode(Agent):\n def __init__(\n self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float]\n ):\n super().__init__(K, T)\n self.lam = lam\n self.get_c = get_c\n # one ridge-regularized Gram matrix A^k = lam * I per arm, stacked as (K, D, D)\n self.A = np.repeat((lam * np.eye(D))[np.newaxis], K, axis=0)\n self.targets = np.zeros((K, D))\n self.w = np.zeros((K, D))\n\n def choose_arm(self, context: Float[Array, \" D\"]):\n c = self.get_c(self.count())\n # score each arm by its predicted reward plus the confidence width from (3.32)\n scores = np.array(\n [\n self.w[k] @ context\n + c * np.sqrt(context @ np.linalg.solve(self.A[k], context))\n for k in range(self.K)\n ]\n )\n return random_argmax(scores)\n\n def update_history(self, context: Float[Array, \" D\"], arm: int, reward: int):\n super().update_history(arm, reward) # record the pull so that count() advances\n self.A[arm] += np.outer(context, context)\n self.targets[arm] += context * reward\n self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm])","key":"ZaiBa0u2sr"},{"type":"output","id":"B2cmYXXqy5mVx_ZwnrjrH","data":[],"key":"CPUifkkhtB"}],"data":{},"key":"vfYSwzQzPv"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"XCtzZV1xZq"}],"key":"PfboIOb8yT"},{"type":"paragraph","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Note that the matrix 
","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"jrEQCjM3iZ"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"ELC8CjmUp8"},{"type":"text","value":" above might not be invertible. When does this occur? One way to address this is to include a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"BPUE96t4rp"},{"type":"inlineMath","value":"\\lambda I","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"λI\\lambda IλI","key":"zW0GnwYNus"},{"type":"text","value":" regularization term to ensure that ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"jq2aK5fSTz"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"au4sgBxxrL"},{"type":"text","value":" is invertible. This is equivalent to solving a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"IAiiivlvTL"},{"type":"emphasis","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"ridge regression","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"zCb75Io4Qk"}],"key":"Yk1mWwdrVo"},{"type":"text","value":" problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"CioLfH1W9j"}],"key":"HYIioaWFzX"}],"key":"VMKzWdSmti"}],"key":"spv12L3436"},{"type":"block","position":{"start":{"line":951,"column":1},"end":{"line":951,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":953,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"inlineMath","value":"c_t","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"ctc_tct","key":"DfrXRcWcK0"},{"type":"text","value":" is similar to the ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"X4ya7idBE7"},{"type":"inlineMath","value":"\\log (2t/\\delta')","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"log(2t/δ)\\log (2t/\\delta')log(2t/δ)","key":"QqTDus35RZ"},{"type":"text","value":" term of UCB: It controls the\nwidth of the confidence interval. 
Here, we treat it as a tunable\nparameter, though in a theoretical analysis, it would depend on ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"tyjdtDmEEQ"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"AtkA_t^kAtk","key":"shYVTEV42x"},{"type":"text","value":"\nand the probability ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"eFI1mJCtlm"},{"type":"text","value":"δ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"E0TytuGqnU"},{"type":"text","value":" with which the bound is allowed to fail.","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"ClfCmIb99s"}],"key":"qysGMeKeCz"},{"type":"paragraph","position":{"start":{"line":958,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Using tools similar to those used for UCB, we can also prove an ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"twTP609O4x"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{T})","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"html":"O~(T)\\tilde{O}(\\sqrt{T})O~(T)","key":"suf2BKo8Fk"},{"type":"text","value":"\nregret bound. The full details of the analysis can be found in Section 3 of ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"V01Enigfj6"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"jE2BfrP134"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"Op8KifyrcN"}],"key":"BerQgFH3cS"},{"type":"text","value":" (2022)","key":"ORU6zZ4LXH"}],"enumerator":"3","key":"NIWL0JvWDk"},{"type":"text","value":".","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"zQ7YVzddXa"}],"key":"FK18bMmaON"},{"type":"heading","depth":2,"position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"key":"Kae9betUBI"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"3.9","key":"yJCYYaWbaQ"},{"type":"paragraph","position":{"start":{"line":963,"column":1},"end":{"line":964,"column":1}},"children":[{"type":"text","value":"In this chapter,\nwe explored the ","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"cxce0ty57t"},{"type":"strong","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"children":[{"type":"text","value":"multi-armed bandit","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"To6H16zn2R"}],"key":"iHtInqjRgQ"},{"type":"text","value":" setting for analyzing sequential decision-making in an unknown environment.","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"nX0rTXwo4J"}],"key":"m5DhZHYYvw"}],"key":"CuXXPOrdZp"}],"key":"hwIClPiZug"},"references":{"cite":{"order":["vershynin_high-dimensional_2018","lai_asymptotically_1985","agarwal_reinforcement_2022"],"data":{"vershynin_high-dimensional_2018":{"label":"vershynin_high-dimensional_2018","enumerator":"1","html":"Vershynin, R. (2018). High-Dimensional Probability: An Introduction with Applications in Data Science. 
Cambridge University Press."},"lai_asymptotically_1985":{"label":"lai_asymptotically_1985","enumerator":"2","doi":"10.1016/0196-8858(85)90002-8","html":"Lai, T. L., & Robbins, H. (1985). Asymptotically Efficient Adaptive Allocation Rules. Advances in Applied Mathematics, 6(1), 4–22. 10.1016/0196-8858(85)90002-8","url":"https://doi.org/10.1016/0196-8858(85)90002-8"},"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"3","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."}}}},"footer":{"navigation":{"prev":{"title":"2 Linear Quadratic Regulators","url":"/control","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"4 Supervised learning","url":"/supervised-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"cb8437494713e13080ce9e296ca5fbb4d04ebda213c523132d19db6324b795e6","slug":"bandits","location":"/bandits.md","dependencies":[],"frontmatter":{"title":"3 Multi-Armed Bandits","numbering":{"all":{"enabled":true},"enumerator":{"template":"3.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"bandits.md","url":"/build/bandits-edc5c0bbc4c299ec710273a0eb78717a.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"nWSeY0x6gC"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"3.1","key":"GUljouEZrf"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":21,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"GB2yISHlNa"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"EPmCrARmmc"}],"key":"Eoc4K7asrv"},{"type":"text","value":" (MAB) setting is a simple setting for studying the basic challenges of sequential decision-making.\nIn this setting, an agent repeatedly chooses from a fixed set of actions, called ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ZDdZGnGukc"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"arms","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"mHZbbwiqVp"}],"key":"wGvcOKRcV8"},{"type":"text","value":", each of which 
has an associated reward distribution. The agent’s goal is to maximize the total reward it receives over some time period.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"CxqQJZjon6"}],"key":"HLoKqUVs3S"},{"type":"comment","value":" \n| States | Actions | Rewards |\n| :----: | :-----: | :---------------------------------: |\n| None | Finite | $\\mathcal{A} \\to \\triangle([0, 1])$ |\n","key":"aPZlUHuv99"},{"type":"paragraph","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"In particular, we’ll spend a lot of time discussing the ","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"WM2RGUd4Ue"},{"type":"strong","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Exploration-Exploitation Tradeoff","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"NuKWfSpfjD"}],"key":"rjO4ypYqUm"},{"type":"text","value":": should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good?","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"mg04WyXOQa"}],"key":"yiSo3Nfsy8"},{"type":"proof","kind":"example","label":"advertising","identifier":"advertising","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Online advertising","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"bCXq5PqUHS"}],"key":"EZSY9N9EVF"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"Let’s suppose you, the agent, are an advertising company. You have ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"aoP6RJaHOI"},{"type":"inlineMath","value":"K","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"KKK","key":"VfyuHVwY6q"},{"type":"text","value":" different ads that you can show to users; for concreteness, let’s suppose there’s just a single user. You receive ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"dun5C6pUbw"},{"type":"text","value":"1","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"G8yjy3ECU5"},{"type":"text","value":" reward if the user clicks the ad, and ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"ZEPJhspxM1"},{"type":"text","value":"0","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"KHR196RuF7"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"R3G9r0xz8c"},{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"TS6QIkuAzL"}],"key":"K1zZ9GDvFh"},{"type":"text","value":" associated to each ad is a Bernoulli distribution defined by the probability that the user clicks on the ad. 
Your goal is to maximize the total number of clicks by the user.","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"ZO8G9DxDM2"}],"key":"Ndnxc9VBtZ"}],"enumerator":"3.1","html_id":"advertising","key":"Ss5xrFJ7ZY"},{"type":"proof","kind":"example","label":"clinical_trials","identifier":"clinical_trials","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Clinical trials","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"gJyCZT7ymr"}],"key":"fEfTnS7Muv"},{"type":"paragraph","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"Suppose you’re a pharmaceutical company, and you’re testing a new drug. You have ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"axjjzNWrJa"},{"type":"inlineMath","value":"K","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"html":"KKK","key":"xzRnT8w3TZ"},{"type":"text","value":" different dosages of the drug that you can administer to patients. You receive ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"Ib4HaHOZjI"},{"type":"text","value":"1","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"YjQxOtv8aL"},{"type":"text","value":" reward if the patient recovers, and ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"qHV3meXXOW"},{"type":"text","value":"0","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"lFnKvx4wBa"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"DNPeiBheaW"},{"type":"emphasis","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"AOZg3VWa3Q"}],"key":"mSAR5f3nkz"},{"type":"text","value":" associated to each dosage is a Bernoulli distribution defined by the probability that the patient recovers. Your goal is to maximize the total number of patients that recover.","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"wpusxwhV36"}],"key":"HrzzBQKOMG"}],"enumerator":"3.2","html_id":"clinical-trials","key":"qSFOf9nzGd"},{"type":"paragraph","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"children":[{"type":"text","value":"In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. 
We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs.","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"pqRvfB6puo"}],"key":"UtQqqz94w7"}],"key":"E4D8R3N1DD"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nimport numpy as np\nimport latexify\nfrom typing import Callable, Union\nimport matplotlib.pyplot as plt\n\nimport solutions.bandits as solutions\n\nnp.random.seed(184)\n\ndef random_argmax(ary: Array) -> int:\n \"\"\"Take an argmax and randomize between ties.\"\"\"\n max_idx = np.flatnonzero(ary == ary.max())\n return np.random.choice(max_idx).item()\n\n\n# used as decorator\nlatex = latexify.algorithmic(\n prefixes={\"mab\"},\n identifiers={\"arm\": \"a_t\", \"reward\": \"r\", \"means\": \"mu\"},\n use_math_symbols=True,\n escape_underscores=False,\n)","key":"GFHLFuZ7cC"},{"type":"output","id":"unLuOsRZ9M9HFnMHyjSZb","data":[],"key":"eX2DIfu9Wz"}],"data":{},"key":"CA3mKlHPID"},{"type":"block","position":{"start":{"line":72,"column":1},"end":{"line":72,"column":1}},"children":[{"type":"proof","kind":"remark","label":"multi-armed","identifier":"multi-armed","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Namesake","position":{"start":{"line":74,"column":1},"end":{"line":74,"column":1}},"key":"TUQ0THHfEI"}],"key":"IADLnxr9DT"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"The name “multi-armed bandits” comes from slot machines in casinos, which are often called “one-armed bandits” since they have one arm (the lever) and take money from the player.","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"phHGMkrwm7"}],"key":"dihXzZjlli"}],"enumerator":"3.1","html_id":"multi-armed","key":"DM1Ad47T3W"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"hTFWm0J4kP"},{"type":"inlineMath","value":"K","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"KKK","key":"qJDrkAAVK7"},{"type":"text","value":" denote the number of arms. We’ll label them ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"x2IDbGZ9Gy"},{"type":"inlineMath","value":"0, \\dots, K-1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"0,,K10, \\dots, K-10,,K1","key":"ufEeAl2WF3"},{"type":"text","value":" and use ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"Kajx0LhIHT"},{"type":"emphasis","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"superscripts","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"o8VE0Y5pVZ"}],"key":"WIkMatWtnL"},{"type":"text","value":" to indicate the arm index; since we seldom need to raise a number to a power, this won’t cause much confusion. 
In this chapter, we’ll consider the ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"acjvZdz4ea"},{"type":"strong","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Bernoulli bandit","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"TZgZO9lqgC"}],"key":"e6NuKE2iNX"},{"type":"text","value":" setting from the examples above, where arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"GhMC6ziu7Z"},{"type":"inlineMath","value":"k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"kkk","key":"SEAEpIJgv4"},{"type":"text","value":" either returns reward ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"kRPc8GbEfA"},{"type":"text","value":"1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"iLeyoOknLH"},{"type":"text","value":" with probability ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"aAsP3KWItX"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"μk\\mu^kμk","key":"lso4pijEQ7"},{"type":"text","value":" or ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"iOOm5xXlf8"},{"type":"text","value":"0","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"l5Awnpgk7U"},{"type":"text","value":" otherwise. The agent gets to pull an arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"YNX6wZD4yJ"},{"type":"inlineMath","value":"T","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"TTT","key":"K5MloImxKQ"},{"type":"text","value":" times in total. 
We can formalize the Bernoulli bandit in the following Python code:","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"u9NkZUeoqb"}],"key":"TTvMZXmaVG"}],"key":"ve4kzOb67A"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class MAB:\n \"\"\"\n The Bernoulli multi-armed bandit environment.\n\n :param means: the means (success probabilities) of the reward distributions for each arm\n :param T: the time horizon\n \"\"\"\n\n def __init__(self, means: Float[Array, \" K\"], T: int):\n assert all(0 <= p <= 1 for p in means)\n self.means = means\n self.T = T\n self.K = self.means.size\n self.best_arm = random_argmax(self.means)\n\n def pull(self, k: int) -> int:\n \"\"\"Pull the `k`-th arm and sample from its (Bernoulli) reward distribution.\"\"\"\n reward = np.random.rand() < self.means[k].item()\n return +reward","key":"o0SYIBcrrb"},{"type":"output","id":"PXfSw9Q5kCjIAXu2hWz8U","data":[],"key":"BoxeOqKpcq"}],"data":{},"key":"GBl3Yuqx8A"},{"type":"block","children":[],"key":"X4HsQlXVlr"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"mab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100)","key":"zHSCvDg7nu"},{"type":"output","id":"FjqH08-BJrKWcPaTfGNx8","data":[],"key":"Nrose8kX9g"}],"data":{},"key":"YitZpqIgG9"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"In pseudocode, the agent’s interaction with the MAB environment can be\ndescribed by the following process:","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"HY9b0uPTib"}],"key":"DgDDNfUClq"}],"key":"JSvpFCkUuU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"@latex\ndef mab_loop(mab: MAB, agent: \"Agent\") -> int:\n for t in range(mab.T):\n arm = agent.choose_arm() # in 0, ..., K-1\n reward = mab.pull(arm)\n agent.update_history(arm, reward)\n\n\nmab_loop","key":"rDDcgp0PLI"},{"type":"output","id":"MrL1osEepjUJwt8n8Q2zi","data":[{"output_type":"execute_result","execution_count":4,"metadata":{},"data":{"text/plain":{"content":"","content_type":"text/plain"},"text/latex":{"content":"$ \\begin{array}{l} \\mathbf{function} \\ \\mathrm{mab\\_loop}(\\mathrm{mab}, \\mathrm{agent}) \\\\ \\hspace{1em} \\mathbf{for} \\ t \\in \\mathrm{range} \\mathopen{}\\left( T \\mathclose{}\\right) \\ \\mathbf{do} \\\\ \\hspace{2em} \\mathrm{a\\_t} \\gets \\mathrm{agent}.\\mathrm{choose\\_arm} \\mathopen{}\\left( \\mathclose{}\\right) \\\\ \\hspace{2em} r \\gets \\mathrm{pull} \\mathopen{}\\left( \\mathrm{a\\_t} \\mathclose{}\\right) \\\\ \\hspace{2em} \\mathrm{agent}.\\mathrm{update\\_history} \\mathopen{}\\left( \\mathrm{a\\_t}, r \\mathclose{}\\right) \\\\ \\hspace{1em} \\mathbf{end \\ for} \\\\ \\mathbf{end \\ function} \\end{array} $","content_type":"text/latex"}}}],"key":"WHSYzxuOVn"}],"data":{},"key":"ug3WYMcV6N"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"dtTOOAZNUY"},{"type":"inlineCode","value":"Agent","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"JdeJrxJBh6"},{"type":"text","value":" class stores the pull history and uses it to decide which arm to pull next. 
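As an aside, the `mab` environment defined above can be sanity-checked directly (a sketch; the exact output depends on the random seed): pulling arm 1 draws from a Bernoulli distribution with mean 0.8.

```python
print([mab.pull(1) for _ in range(10)])            # e.g. mostly 1s
print(np.mean([mab.pull(1) for _ in range(1000)]))  # close to 0.8
```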
Since we are working with Bernoulli bandits, we can summarize the pull history concisely in a ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"p0W1LVF6Wj"},{"type":"inlineMath","value":"\\mathbb{N}^{K \\times 2}","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"NK×2\\mathbb{N}^{K \\times 2}NK×2","key":"A1pya8l9Es"},{"type":"text","value":" array.","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"FC5uzHudiG"}],"key":"kz2o6tPrnK"}],"key":"VqnxzViznl"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Agent:\n def __init__(self, K: int, T: int):\n \"\"\"The MAB agent that decides how to choose an arm given the past history.\"\"\"\n self.K = K\n self.T = T\n self.rewards = [] # for plotting\n self.choices = []\n self.history = np.zeros((K, 2), dtype=int)\n\n def choose_arm(self) -> int:\n \"\"\"Choose an arm of the MAB. Algorithm-specific.\"\"\"\n ...\n\n def count(self) -> int:\n \"\"\"The number of pulls made. Also the current step index.\"\"\"\n return len(self.rewards)\n\n def update_history(self, arm: int, reward: int):\n self.rewards.append(reward)\n self.choices.append(arm)\n self.history[arm, reward] += 1","key":"ncTznhLvDg"},{"type":"output","id":"IdwxoNojJm1q5Q0NkJAAz","data":[],"key":"NF5OsFs3dq"}],"data":{},"key":"jYH6CCA8vw"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":149,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"What’s the ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"QIQLCWggzI"},{"type":"emphasis","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"optimal","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"bDqoxub2Zr"}],"key":"wkKhDkZZI2"},{"type":"text","value":" strategy for the agent, i.e. the one that achieves\nthe highest expected reward? 
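One convenience of this representation (a sketch; `agent` here stands for any `Agent` after some interaction with the environment): each arm's pull count and empirical mean reward can be read directly off the history array.

```python
counts = agent.history.sum(axis=1)     # number of pulls of each arm
wins = agent.history[:, 1]             # number of reward-1 observations per arm
mu_hat = wins / np.maximum(counts, 1)  # empirical means (0 for unpulled arms)
```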
Convince yourself that the agent should try\nto always pull the arm with the highest expected reward:","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"ZTXDuZDufz"}],"key":"SQYCTWILXW"},{"type":"math","value":"\\mu^\\star := \\max_{k \\in [K]} \\mu^k.","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"μ:=maxk[K]μk.\\mu^\\star := \\max_{k \\in [K]} \\mu^k.μ:=k[K]maxμk.","enumerator":"3.1","key":"VUmZeIJgzs"},{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"The goal, then, can be rephrased as to minimize the ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"QCqHYKFo3f"},{"type":"strong","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"wUDPqTnHJP"}],"key":"hZcXEAb6Pa"},{"type":"text","value":", defined\nbelow:","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"HV9YEwEfe1"}],"key":"UGt4tHskML"},{"type":"proof","kind":"definition","label":"regret","identifier":"regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Regret","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"BPPMSSBy3R"}],"key":"u8tx3IlKgB"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"The agent’s ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"GogLfxwkG7"},{"type":"strong","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"phaxTuWurx"}],"key":"S01Hl1tlTW"},{"type":"text","value":" after ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"YsfC2I4Ob1"},{"type":"inlineMath","value":"T","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"html":"TTT","key":"MByZ0RVTJA"},{"type":"text","value":" timesteps is defined as","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"iP2Rstx9OK"}],"key":"T5OFimXKVf"},{"type":"math","value":"\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.","position":{"start":{"line":163,"column":1},"end":{"line":165,"column":1}},"html":"RegretT:=t=0T1μμat.\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.RegretT:=t=0T1μμat.","enumerator":"3.2","key":"itXINkXY8Y"}],"enumerator":"3.1","html_id":"regret","key":"VWqFsTMAoe"}],"key":"t3fGpB8EEU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def regret_per_step(mab: MAB, agent: Agent):\n \"\"\"Get the difference from the average reward of the optimal arm. 
The sum of these is the regret.\"\"\"\n return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices]","key":"jvZ7j2E6eN"},{"type":"output","id":"yZAdfCijgwECGHGhITuM0","data":[],"key":"rJKniOdKkd"}],"data":{},"key":"xDih1GYShA"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":174,"column":1},"end":{"line":178,"column":1}},"children":[{"type":"text","value":"Note that this depends on the ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"NWWmhmqxTh"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"true means","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"ZFMkHodDg4"}],"key":"gCZcJmXUXC"},{"type":"text","value":" of the pulled arms, ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"KBWfPcknt7"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"lUQIyqrHfC"}],"key":"q4rEVxkomJ"},{"type":"text","value":" the actual\nobserved rewards.\nWe typically think of this as a random variable where\nthe randomness comes from the agent’s strategy (i.e. the sequence of\nactions ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"J5GHEb16Fd"},{"type":"inlineMath","value":"a_0, \\dots, a_{T-1}","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"html":"a0,,aT1a_0, \\dots, a_{T-1}a0,,aT1","key":"NiygrE8NKp"},{"type":"text","value":").","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"lsathjhhSW"}],"key":"CxNab0sk46"},{"type":"paragraph","position":{"start":{"line":180,"column":1},"end":{"line":181,"column":1}},"children":[{"type":"text","value":"Throughout the chapter, we will try to upper bound the regret of various\nalgorithms in two different senses:","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"ROA7i8HF7g"}],"key":"HPTy45MZ60"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":183,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":183,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":184,"column":1}},"children":[{"type":"text","value":"Upper bound the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"m9bypTaZcu"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"expected regret,","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"aNIJ10yEYF"}],"key":"KitGlFR1zQ"},{"type":"text","value":" i.e. 
show\n","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"L2WQhk5Ycc"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] \\le M_T","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"E[RegretT]MT\\E[\\text{Regret}_T] \\le M_TE[RegretT]MT","key":"nRIrtgavAD"},{"type":"text","value":".","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"a6a9tI9CHr"}],"key":"oxfbyClNDV"}],"key":"Dmz5OxGM2L"},{"type":"listItem","spread":true,"position":{"start":{"line":186,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":186,"column":1},"end":{"line":187,"column":1}},"children":[{"type":"text","value":"Find a ","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"gX01am16OP"},{"type":"emphasis","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"children":[{"type":"text","value":"high-probability","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"LbzlYE8lIq"}],"key":"Ooi8SQ4UyU"},{"type":"text","value":" upper bound on the regret, i.e. show\n","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"AD0R1ng8RK"},{"type":"inlineMath","value":"\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\delta","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"html":"P(RegretTMT,δ)1δ\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\deltaP(RegretTMT,δ)1δ","key":"APh5ycwyX4"},{"type":"text","value":".","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"fwUw4rtivX"}],"key":"AshKXLm1L2"}],"key":"a8D7yYMDix"}],"key":"V9b6qKFrGC"},{"type":"paragraph","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"Note that these two different approaches say very different things about the regret. The first approach says that the ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"cocNsYnSS8"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"mAqMIiRc4f"}],"key":"Wt2KJh7bh9"},{"type":"text","value":" regret is at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"kHrt4J0MGk"},{"type":"inlineMath","value":"M_T","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MTM_TMT","key":"hnZZjQSfoK"},{"type":"text","value":". However, the agent might still achieve higher regret on many runs. The second approach says that, ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"dFAwtNdjCD"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"with high probability","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"KjpVxUEcEi"}],"key":"zGvG2ooJWq"},{"type":"text","value":", the agent will achieve regret at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"lujKcetkop"},{"type":"inlineMath","value":"M_{T, \\delta}","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MT,δM_{T, \\delta}MT,δ","key":"pDynFUe6Nb"},{"type":"text","value":". 
However, it doesn’t say anything about the regret in the remaining ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"opr2P9eN6P"},{"type":"text","value":"δ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"x3fhyquJlr"},{"type":"text","value":" fraction of runs, which might be arbitrarily high.","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"UAdKdp9qhc"}],"key":"nOBHywcfFA"},{"type":"paragraph","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"We’d like to achieve ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"Qgseuiw2tR"},{"type":"strong","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"sublinear regret","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"qJJPQBc4yV"}],"key":"TQdFmKq5XL"},{"type":"text","value":" in expectation, i.e. ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"bY0QDcq6QA"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] = o(T)","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"html":"E[RegretT]=o(T)\\E[\\text{Regret}_T] = o(T)E[RegretT]=o(T)","key":"T8Kk1Uey74"},{"type":"text","value":". That is, as we learn more about the environment, we’d like to be able to exploit that knowledge to take the optimal arm as often as possible.","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"L9ZuAnESqS"}],"key":"CetsxQ8MDf"},{"type":"paragraph","position":{"start":{"line":193,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"The rest of the chapter comprises a series of increasingly sophisticated\nMAB algorithms.","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"LnVj6HBJXd"}],"key":"VzI4wjbjf1"}],"key":"opKBiFUd1n"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def plot_strategy(mab: MAB, agent: Agent):\n plt.figure(figsize=(10, 6))\n\n # plot reward and cumulative regret\n plt.plot(np.arange(mab.T), np.cumsum(agent.rewards), label=\"reward\")\n cum_regret = np.cumsum(regret_per_step(mab, agent))\n plt.plot(np.arange(mab.T), cum_regret, label=\"cumulative regret\")\n\n # draw colored circles for arm choices\n colors = [\"red\", \"green\", \"blue\"]\n color_array = [colors[k] for k in agent.choices]\n plt.scatter(np.arange(mab.T), np.zeros(mab.T), c=color_array, label=\"arm\")\n\n # labels and title\n plt.xlabel(\"timestep\")\n plt.legend()\n plt.title(f\"{agent.__class__.__name__} reward and regret\")\n plt.show()","visibility":"hide","key":"EG846uP9n9"},{"type":"output","id":"qNqZ7SHOwlBeuud3V2_V5","data":[],"visibility":"show","key":"gynNpomIRr"}],"data":{"tags":[]},"visibility":"show","key":"J75CkM3xtM"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"children":[{"type":"text","value":"Pure exploration (random guessing)","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"l62w0ze1Co"}],"identifier":"pure-exploration-random-guessing","label":"Pure exploration (random 
guessing)","html_id":"pure-exploration-random-guessing","implicit":true,"enumerator":"3.2","key":"JHZSMmguhk"},{"type":"paragraph","position":{"start":{"line":221,"column":1},"end":{"line":222,"column":1}},"children":[{"type":"text","value":"A trivial strategy is to always choose arms at random (i.e. “pure\nexploration”).","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"tooJwPmUvt"}],"key":"R7t4HI4jUp"}],"key":"QI3SmAHpLS"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureExploration(Agent):\n def choose_arm(self):\n \"\"\"Choose an arm uniformly at random.\"\"\"\n return solutions.pure_exploration_choose_arm(self)","identifier":"pure_exploration-code","enumerator":"3.1","html_id":"pure-exploration-code","key":"IddyJpGmXh"},{"type":"output","id":"1S6BJkONUjDFzNcNE__-s","data":[],"identifier":"pure_exploration-output","enumerator":"3.1","html_id":"pure-exploration-output","key":"e9OdCKoxgX"}],"data":{},"label":"pure_exploration","identifier":"pure_exploration","enumerator":"3.1","html_id":"pure-exploration","key":"XfWdkvRCwS"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"Note that","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"RrgpIXnx2u"}],"key":"DlRDVTnGkb"},{"type":"math","value":"\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^k","position":{"start":{"line":235,"column":1},"end":{"line":237,"column":1}},"html":"EatUnif([K])[μat]=μˉ=1Kk=1Kμk\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^kEatUnif([K])[μat]=μˉ=K1k=1Kμk","enumerator":"3.3","key":"yac5ujYbVe"},{"type":"paragraph","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"so the expected regret is simply","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"FotkWHF6kO"}],"key":"yDUVkS1Dx8"},{"type":"math","value":"\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}","position":{"start":{"line":241,"column":1},"end":{"line":246,"column":1}},"html":"E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.","enumerator":"3.4","key":"hThbtvbnV3"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"This scales as ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"fUCfFQPYS1"},{"type":"inlineMath","value":"\\Theta(T)","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"Θ(T)\\Theta(T)Θ(T)","key":"o7layOA1GV"},{"type":"text","value":", i.e. 
","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"Sax1MHy8JO"},{"type":"emphasis","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"xNb6GlYNF0"}],"key":"eCDAYoFmPQ"},{"type":"text","value":" in the number of timesteps ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"W7pdeFTRyl"},{"type":"inlineMath","value":"T","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"TTT","key":"dJtzvCMWcA"},{"type":"text","value":". There’s no learning here: the agent doesn’t use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears “(uniformly) random”.","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"ffca0nvUn6"}],"key":"Ww5s32J9xi"}],"key":"vwuGV6EIQy"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = PureExploration(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"mTwFP24Ue3"},{"type":"output","id":"U6lrNi3FYZONd1LZaXEmk","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"e018a4b689feff2c40f2483432d7c76f","path":"/build/e018a4b689feff2c40f2483432d7c76f.png"}}}],"key":"dl2Sh3mqRw"}],"data":{},"key":"ic1R0xd61w"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Pure greedy","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"wjIWkz1loC"}],"identifier":"pure-greedy","label":"Pure greedy","html_id":"pure-greedy","implicit":true,"enumerator":"3.3","key":"aa8mEPTevJ"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"text","value":"How might we improve on pure exploration? Instead, we could try each arm\nonce, and then commit to the one with the highest observed reward. We’ll\ncall this the ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"CXTBjo7Dm1"},{"type":"strong","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"pure greedy","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"DbikD9Pn1m"}],"key":"ENrZbDb36n"},{"type":"text","value":" strategy.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"AiN0qTOqZD"}],"key":"NA6r7KXWei"}],"key":"wmMXTJbWGT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureGreedy(Agent):\n def choose_arm(self):\n \"\"\"Choose the arm with the highest observed reward on its first pull.\"\"\"\n return solutions.pure_greedy_choose_arm(self)","identifier":"pure_greedy-code","enumerator":"3.2","html_id":"pure-greedy-code","key":"fnmWsFQmSQ"},{"type":"output","id":"bPlu65MxQqmTMNe2SFOrY","data":[],"identifier":"pure_greedy-output","enumerator":"3.2","html_id":"pure-greedy-output","key":"PArUyguBQS"}],"data":{},"label":"pure_greedy","identifier":"pure_greedy","enumerator":"3.2","html_id":"pure-greedy","key":"Sjlon2Xl7D"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":271,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Note we’ve used superscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"P0qCa84XUt"},{"type":"inlineMath","value":"r^k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rkr^krk","key":"nurJeuHj7V"},{"type":"text","value":" during the exploration phase to\nindicate that we observe exactly one reward for each arm. 
Then we use\nsubscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"Lt5MwTyVsC"},{"type":"inlineMath","value":"r_t","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rtr_trt","key":"bXdsWw4IYj"},{"type":"text","value":" during the exploitation phase to indicate that we\nobserve a sequence of rewards from the chosen greedy arm ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"azCOU7H8wt"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"k^\\hat kk^","key":"T98bTQxjUq"},{"type":"text","value":".","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"vRueTVRUSE"}],"key":"xskttodLJl"},{"type":"paragraph","position":{"start":{"line":276,"column":1},"end":{"line":279,"column":1}},"children":[{"type":"text","value":"How does the expected regret of this strategy compare to that of pure\nexploration? We’ll do a more general analysis in the following section.\nNow, for intuition, suppose there’s just ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"Zzm20xg2FO"},{"type":"inlineMath","value":"K=2","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"K=2K=2K=2","key":"FhXTBEAcQP"},{"type":"text","value":" arms, with Bernoulli\nreward distributions with means ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"HCK6gqPuLK"},{"type":"inlineMath","value":"\\mu^0 > \\mu^1","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"μ0>μ1\\mu^0 > \\mu^1μ0>μ1","key":"EpUpMJ7HP9"},{"type":"text","value":".","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"QJdFEWkbMg"}],"key":"tuZcUzmdgM"},{"type":"paragraph","position":{"start":{"line":281,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Let’s let ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"fGrtPP4Sdk"},{"type":"inlineMath","value":"r^0","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0r^0r0","key":"XfmrY1s9fn"},{"type":"text","value":" be the random reward from the first arm and ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"oIUVjfPKgw"},{"type":"inlineMath","value":"r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r1r^1r1","key":"Y0xU1taJWq"},{"type":"text","value":" be the\nrandom reward from the second. If ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"V7SFXUcqUv"},{"type":"inlineMath","value":"r^0 > r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0>r1r^0 > r^1r0>r1","key":"eV5IAGNqEY"},{"type":"text","value":", then we achieve zero\nregret. Otherwise, we achieve regret ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"fNeOTnz8rl"},{"type":"inlineMath","value":"T(\\mu^0 - \\mu^1)","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"T(μ0μ1)T(\\mu^0 - \\mu^1)T(μ0μ1)","key":"A7x8fQcSTl"},{"type":"text","value":". 
Thus, the\nexpected regret is simply:","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"cXdjftRd8P"}],"key":"GvuSkOCbPv"},{"type":"math","value":"\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\pr(r^0 < r^1) \\cdot T(\\mu^0 - \\mu^1) + c \\\\\n &= (1 - \\mu^0) \\mu^1 \\cdot T(\\mu^0 - \\mu^1) + c\n\\end{aligned}","position":{"start":{"line":286,"column":1},"end":{"line":291,"column":1}},"html":"E[RegretT]=P(r0<r1)T(μ0μ1)+c=(1μ0)μ1T(μ0μ1)+c\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\pr(r^0 < r^1) \\cdot T(\\mu^0 - \\mu^1) + c \\\\\n &= (1 - \\mu^0) \\mu^1 \\cdot T(\\mu^0 - \\mu^1) + c\n\\end{aligned}E[RegretT]=P(r0<r1)T(μ0μ1)+c=(1μ0)μ1T(μ0μ1)+c","enumerator":"3.5","key":"ZgX2NbB2AZ"},{"type":"paragraph","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"children":[{"type":"text","value":"Which is still ","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"Tl2WqrfScj"},{"type":"inlineMath","value":"\\Theta(T)","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"Θ(T)\\Theta(T)Θ(T)","key":"vbtUhPRFhj"},{"type":"text","value":", the same as pure exploration!","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"TzFAXhVKcW"}],"key":"wfDYNzK7ka"}],"key":"nIvQPxCncJ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = PureGreedy(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"fOPybr5DjB"},{"type":"output","id":"tFQttZ4A4i6KhP5x7tiH4","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"60449ce2034aedba8d659c77e97c9729","path":"/build/60449ce2034aedba8d659c77e97c9729.png"}}}],"key":"ynkZMg0YlM"}],"data":{},"key":"TBNJgdr5yL"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"The cumulative regret is a straight line because the regret only depends on the arms chosen and not the actual reward observed. In fact, if the greedy algorithm happens to get lucky on the first set of pulls, it may act entirely optimally for that episode! But its ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Elu6Dqp54w"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"iKc3fCEGff"}],"key":"F9Qn2SQ4T4"},{"type":"text","value":" regret is what measures its effectiveness.","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"bT1GewcZoY"}],"key":"IwEjeU3Ycs"}],"key":"FpEGhhUIJQ"},{"type":"block","position":{"start":{"line":303,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"children":[{"type":"text","value":"Explore-then-commit","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"yrN9U8YMdv"}],"label":"etc","identifier":"etc","html_id":"etc","enumerator":"3.4","key":"RLBWtnFZP4"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":309,"column":1}},"children":[{"type":"text","value":"We can improve the pure greedy algorithm as follows: let’s reduce the variance of the reward estimates by pulling each arm ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"aGjPnNY6Pr"},{"type":"inlineMath","value":"N_{\\text{explore}}> 1","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"Nexplore>1N_{\\text{explore}}> 1Nexplore>1","key":"SEXIU4wwPs"},{"type":"text","value":" times before committing. This is called the ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"E4mC336u6O"},{"type":"strong","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"explore-then-commit","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"ww5ESic92H"}],"key":"xIxnyeTCA8"},{"type":"text","value":" strategy. 
Note that the “pure greedy” strategy above is just the special case where\n","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"bGh239jBTw"},{"type":"inlineMath","value":"N_{\\text{explore}}= 1","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"Nexplore=1N_{\\text{explore}}= 1Nexplore=1","key":"vvPCYHaEeh"},{"type":"text","value":".","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"Y4WxZx0iyb"}],"key":"rrEkl3pGCn"}],"key":"sBzRvtB09g"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class ExploreThenCommit(Agent):\n def __init__(self, K: int, T: int, N_explore: int):\n super().__init__(K, T)\n self.N_explore = N_explore\n\n def choose_arm(self):\n return solutions.etc_choose_arm(self)","key":"Gg2cIObHOG"},{"type":"output","id":"zZ5SqXy4CJqspAAYekT4k","data":[],"key":"KXY9cc37Mp"}],"data":{},"key":"ZwqA6AokLN"},{"type":"block","children":[],"key":"qm46ncIJBy"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = ExploreThenCommit(mab.K, mab.T, mab.T // 15)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"cCEIgWiZKm"},{"type":"output","id":"83hjd2X7NUR4RdbV-7eZU","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"dde6263087532775cde0fb2de5a471cc","path":"/build/dde6263087532775cde0fb2de5a471cc.png"}}}],"key":"FWzid5dxLI"}],"data":{},"key":"KfUEkJN1Dl"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"children":[{"type":"text","value":"Notice that now, the graphs are much more consistent, and the algorithm finds the true optimal arm and sticks with it much more frequently. We would expect ETC to then have a better (i.e. lower) average regret. Can we prove this?","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"H9wWOeKfkJ"}],"key":"XwxEyLAURh"}],"key":"pNk3LEmWoA"},{"type":"block","position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"children":[{"type":"text","value":"ETC regret analysis","position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"key":"oeEyUS39dF"}],"label":"etc-regret-analysis","identifier":"etc-regret-analysis","html_id":"etc-regret-analysis","enumerator":"3.4.1","key":"yu5jADZxxo"},{"type":"paragraph","position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Let’s analyze the expected regret of the explore-then-commit strategy by splitting it up\ninto the exploration and exploitation phases.","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"dy7dMu5ab4"}],"key":"pgI94t26hT"},{"type":"heading","depth":4,"position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"Exploration phase.","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"LxEiXJLl8K"}],"identifier":"exploration-phase","label":"Exploration phase.","html_id":"exploration-phase","implicit":true,"enumerator":"3.4.1.1","key":"sZMeInJKz9"},{"type":"paragraph","position":{"start":{"line":339,"column":1},"end":{"line":341,"column":1}},"children":[{"type":"text","value":"This phase takes ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"xfxobjeHxn"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"guz1V4Uylz"},{"type":"text","value":" timesteps. 
Since at each step we\nincur at most ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"YyVVAlJg5P"},{"type":"text","value":"1","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"m62xoqrTO0"},{"type":"text","value":" regret, the total regret is at most\n","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"YMPj9fIZWZ"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"wT0DtzbSBj"},{"type":"text","value":".","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"sz7QyJw0JZ"}],"key":"A9a0t7btJ2"},{"type":"heading","depth":4,"position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"Exploitation phase.","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"PYw6azBUur"}],"identifier":"exploitation-phase","label":"Exploitation phase.","html_id":"exploitation-phase","implicit":true,"enumerator":"3.4.1.2","key":"mDRDmSceoh"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"This will take a bit more effort. We’ll prove that for any total time ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"rfs2cJ8cZH"},{"type":"inlineMath","value":"T","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"TTT","key":"ON2v3eSKBN"},{"type":"text","value":", we can choose ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"S6fPJ1QtQR"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"Ie27CdlgVB"},{"type":"text","value":" such that with arbitrarily high probability, the regret is sublinear.","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"dxdDUKmtum"}],"key":"PQtNU9Uq6G"},{"type":"paragraph","position":{"start":{"line":347,"column":1},"end":{"line":348,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"x5LhPkZ5Y0"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"html":"k^\\hat kk^","key":"hoAdEEzprt"},{"type":"text","value":" denote the arm chosen after the exploration phase. 
We know the regret from the\nexploitation phase is","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"sDISPydKaT"}],"key":"wORFzZ3a82"},{"type":"math","value":"T_{\\text{exploit}} (\\mu^\\star - \\mu^{\\hat k}) \\qquad \\text{where} \\qquad T_{\\text{exploit}} := T - N_{\\text{explore}}K.","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"html":"Texploit(μμk^)whereTexploit:=TNexploreK.T_{\\text{exploit}} (\\mu^\\star - \\mu^{\\hat k}) \\qquad \\text{where} \\qquad T_{\\text{exploit}} := T - N_{\\text{explore}}K.Texploit(μμk^)whereTexploit:=TNexploreK.","enumerator":"3.6","key":"R0D86ImVCo"},{"type":"paragraph","position":{"start":{"line":352,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"So we’d like to bound ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"NtqlmSeGqF"},{"type":"inlineMath","value":"\\mu^\\star - \\mu^{\\hat k} = o(1)","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"μμk^=o(1)\\mu^\\star - \\mu^{\\hat k} = o(1)μμk^=o(1)","key":"NviFPkgwvz"},{"type":"text","value":" (as a function\nof ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"VAZMuju1uD"},{"type":"inlineMath","value":"T","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"TTT","key":"bgHXCiPB82"},{"type":"text","value":") in order to achieve sublinear regret. How can we do this?","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"ijIKU3PcnB"}],"key":"TfPxpkvcei"},{"type":"paragraph","position":{"start":{"line":355,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"Let’s define ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"VO7F6YnGUS"},{"type":"inlineMath","value":"\\Delta^k = \\hat \\mu^k - \\mu^k","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"html":"Δk=μ^kμk\\Delta^k = \\hat \\mu^k - \\mu^kΔk=μ^kμk","key":"J6sMNgIfdb"},{"type":"text","value":" to denote how far the mean\nestimate for arm ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"L45Az9Tlah"},{"type":"inlineMath","value":"k","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"html":"kkk","key":"j3d6goXk7P"},{"type":"text","value":" is from the true mean. How can we bound this\nquantity? We’ll use the following useful inequality for i.i.d. bounded\nrandom variables:","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"C6py7gRI8C"}],"key":"R1bGHXehZh"},{"type":"proof","kind":"theorem","label":"hoeffding","identifier":"hoeffding","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Hoeffding’s inequality","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"ZoRJ91RljB"}],"key":"kmMziOzzSl"},{"type":"paragraph","position":{"start":{"line":363,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"hxMKCSdjfm"},{"type":"inlineMath","value":"X_0, \\dots, X_{n-1}","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"X0,,Xn1X_0, \\dots, X_{n-1}X0,,Xn1","key":"oaFG5HjUcm"},{"type":"text","value":" be i.i.d. 
random variables with\n","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"a9ckb1HYoQ"},{"type":"inlineMath","value":"X_i \\in [0, 1]","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"Xi[0,1]X_i \\in [0, 1]Xi[0,1]","key":"wK9Jqddl2C"},{"type":"text","value":" almost surely for each ","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"h07HGtIdcc"},{"type":"inlineMath","value":"i \\in [n]","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"i[n]i \\in [n]i[n]","key":"JFF2ZSR6Mv"},{"type":"text","value":". Then for any\n","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"rm2udeY8nm"},{"type":"inlineMath","value":"\\delta > 0","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"δ>0\\delta > 0δ>0","key":"viWFYLJVXO"},{"type":"text","value":",","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"J8e3m3ACp3"}],"key":"eExMc5V1KU"},{"type":"math","value":"\\pr\\left( \\left| \\frac{1}{n} \\sum_{i=1}^n (X_i - \\E[X_i]) \\right| > \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) \\le \\delta.","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"html":"P(1ni=1n(XiE[Xi])>ln(2/δ)2n)δ.\\pr\\left( \\left| \\frac{1}{n} \\sum_{i=1}^n (X_i - \\E[X_i]) \\right| > \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) \\le \\delta.P(n1i=1n(XiE[Xi])>2nln(2/δ))δ.","enumerator":"3.7","key":"sfj6lPFibl"}],"enumerator":"3.1","html_id":"hoeffding","key":"G88ISqfVEo"},{"type":"paragraph","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"children":[{"type":"text","value":"The proof of this inequality is beyond the scope of this book. 
See ","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"Tz9uyKNopI"},{"type":"cite","kind":"narrative","label":"vershynin_high-dimensional_2018","identifier":"vershynin_high-dimensional_2018","children":[{"type":"text","value":"Vershynin (2018)","key":"kyRqw0hyVJ"}],"enumerator":"1","key":"irSPTTY6aD"},{"type":"text","value":" Chapter 2.2.","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"jpyFBM8ewf"}],"key":"GJOWdDFDJR"},{"type":"paragraph","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"children":[{"type":"text","value":"We can apply this directly to the rewards for a given arm ","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"qcerJkJDYq"},{"type":"inlineMath","value":"k","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"html":"kkk","key":"uM6unro1UY"},{"type":"text","value":", since the rewards from that arm are i.i.d.:","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"fbhcvASxvJ"}],"key":"r3meg7hpIO"},{"type":"math","value":"\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.","label":"hoeffding-etc","identifier":"hoeffding-etc","html":"P(Δk>ln(2/δ)2Nexplore)δ.\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.P(Δk>2Nexploreln(2/δ))δ.","enumerator":"3.8","html_id":"hoeffding-etc","key":"xbSGLmHalO"},{"type":"paragraph","position":{"start":{"line":380,"column":1},"end":{"line":384,"column":1}},"children":[{"type":"text","value":"But note that we can’t apply this to arm ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"jcVQl8pGBQ"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"FPKz2WFC3K"},{"type":"text","value":" directly since\n","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"RFT4uwY99C"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"KAyAWRiUEO"},{"type":"text","value":" is itself a random variable. Instead, we need to “uniform-ize”\nthis bound across ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"kn5wYMmAIX"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"all","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"BtswJekHi2"}],"key":"bx89JXnNeB"},{"type":"text","value":" the arms, i.e. 
bound the error across all the\narms simultaneously, so that the resulting bound will apply ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"ypmZinRbw2"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"no matter\nwhat","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"dmkybx1kgm"}],"key":"uTJgvsd5RQ"},{"type":"text","value":" ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"CbOMAIrmkw"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"Y8FogyitIJ"},{"type":"text","value":" “crystallizes” to.","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"VYaXekCCiO"}],"key":"IVCqO1CmjR"},{"type":"paragraph","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"TWStIFhm6K"},{"type":"strong","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"union bound","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"EJwDLBPIdD"}],"key":"oir8Bf2g6k"},{"type":"text","value":" provides a simple way to do this:","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"tfLYXIoEPS"}],"key":"JkUnydWxl9"},{"type":"proof","kind":"theorem","label":"union_bound","identifier":"union_bound","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Union bound","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"a9ziuDAa7D"}],"key":"HPoIdkUaF2"},{"type":"paragraph","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"children":[{"type":"text","value":"Consider a set of events ","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"F6nseJ2DqY"},{"type":"inlineMath","value":"A_0, \\dots, A_{n-1}","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"html":"A0,,An1A_0, \\dots, A_{n-1}A0,,An1","key":"Jbo3hTAZfr"},{"type":"text","value":". Then","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"Kbq5xPquLp"}],"key":"hsgTL2WclF"},{"type":"math","value":"\\pr(\\exists i \\in [n]. A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"html":"P(i[n].Ai)i=0n1P(Ai).\\pr(\\exists i \\in [n]. 
A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).P(i[n].Ai)i=0n1P(Ai).","enumerator":"3.9","key":"dzdDA2WMGA"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":396,"column":1}},"children":[{"type":"text","value":"In\nparticular, if ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"a8LCgIA67k"},{"type":"inlineMath","value":"\\pr(A_i) \\ge 1 - \\delta","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"P(Ai)1δ\\pr(A_i) \\ge 1 - \\deltaP(Ai)1δ","key":"eAWjVn9dF7"},{"type":"text","value":" for each ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"BfP0dB08fL"},{"type":"inlineMath","value":"i \\in [n]","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"i[n]i \\in [n]i[n]","key":"Qc8TzEXdC1"},{"type":"text","value":", we have","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"CJsgQvgXXi"}],"key":"Av6ld36v5p"},{"type":"math","value":"\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"P(i[n].Ai)1nδ.\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.P(i[n].Ai)1nδ.","enumerator":"3.10","key":"mDMfgoEQiH"}],"enumerator":"3.2","html_id":"union-bound","key":"REgQuxUbOG"},{"type":"paragraph","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"strong","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"CtNmXOZqhf"}],"key":"jG3urHpctQ"},{"type":"text","value":" Prove the second statement above.","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"VxPq2Fbowy"}],"key":"eO3CtZw1Ya"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Applying the union bound across the arms for the l.h.s. 
event of ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"th7UFGalmm"},{"type":"crossReference","kind":"equation","identifier":"hoeffding-etc","label":"hoeffding-etc","children":[{"type":"text","value":"(","key":"c8GyvmSWNI"},{"type":"text","value":"3.8","key":"VL8nr9Zz7X"},{"type":"text","value":")","key":"rFdd7xlMeJ"}],"template":"(%s)","enumerator":"3.8","resolved":true,"html_id":"hoeffding-etc","key":"qOVYbI7zsA"},{"type":"text","value":", we have","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"mQMA9ri4eH"}],"key":"ELDVaLnzdZ"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}","position":{"start":{"line":405,"column":1},"end":{"line":409,"column":1}},"html":"P(k[K],Δkln(2/δ)2Nexplore)1Kδ\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}P(k[K],Δk2Nexploreln(2/δ))1","enumerator":"3.11","key":"FW1TYqqliE"},{"type":"paragraph","position":{"start":{"line":411,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"text","value":"Then to apply this bound to ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"j4QM9rEpz3"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"k^\\hat kk^","key":"Ie7etmAvQL"},{"type":"text","value":" in particular, we\ncan apply the useful trick of “adding zero”:","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"P9mVpUzhVB"}],"key":"MSEQgT67mK"},{"type":"math","value":"\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}","position":{"start":{"line":414,"column":1},"end":{"line":420,"column":1}},"html":"μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+(μ^kμ^k^)0 by definition of k^2ln(2K/δ)2Nexplore with probability at least 1δ\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+0 by definition of k^(μ^kμ^k^)22Nexploreln(2K/δ) with probability at least 1δ","enumerator":"3.12","key":"P4VQwIIUwR"},{"type":"paragraph","position":{"start":{"line":422,"column":1},"end":{"line":423,"column":1}},"children":[{"type":"text","value":"where we’ve set ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"NuTq3aEvSf"},{"type":"inlineMath","value":"\\delta' = K\\delta","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"δ=Kδ\\delta' = K\\deltaδ=","key":"eWCV3SkPIe"},{"type":"text","value":". 
Putting this all\ntogether, we’ve shown that, with probability ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"y9YNdJTtJy"},{"type":"inlineMath","value":"1 - \\delta'","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"1δ1 - \\delta'1δ","key":"aSFpuTtxrT"},{"type":"text","value":",","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"N82zhkpTXr"}],"key":"oTf1RtCmtK"},{"type":"math","value":"\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"RegretTNexploreK+Texploit2ln(2K/δ)Nexplore.\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.RegretTNexploreK+TexploitNexplore2ln(2K/δ).","enumerator":"3.13","key":"epYZsPhw9r"},{"type":"paragraph","position":{"start":{"line":427,"column":1},"end":{"line":430,"column":1}},"children":[{"type":"text","value":"Note that it suffices for ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"uNqBEojYIQ"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"OpLj7k9EvT"},{"type":"text","value":" to be on the order of\n","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"YsYpf7WaTX"},{"type":"inlineMath","value":"\\sqrt{T}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"T\\sqrt{T}T","key":"bV44dDvpi2"},{"type":"text","value":" to achieve sublinear regret. In particular, we can find the\noptimal ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"SItNT6y0Pg"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"JsRPqamD8p"},{"type":"text","value":" by setting the derivative of the r.h.s. 
to\nzero:","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"Tvmu2VlGuU"}],"key":"swDlBTLFRp"},{"type":"math","value":"\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}","position":{"start":{"line":432,"column":1},"end":{"line":437,"column":1}},"html":"0=KTexploit122ln(2K/δ)Nexplore3Nexplore=(Texploitln(2K/δ)/2K)2/3\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}0Nexplore=KTexploit21Nexplore32ln(2K/δ)=(TexploitKln(2K/δ)/2)2/3","enumerator":"3.14","key":"id5azWmM4v"},{"type":"paragraph","position":{"start":{"line":439,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"Plugging this into the expression for the regret, we\nhave (still with probability ","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"RWzIEXCb5J"},{"type":"inlineMath","value":"1-\\delta'","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"html":"1δ1-\\delta'1δ","key":"AN2nONDihe"},{"type":"text","value":")","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"vFiRtfClNZ"}],"key":"B06h9sHA6F"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}","position":{"start":{"line":442,"column":1},"end":{"line":447,"column":1}},"html":"RegretT3T2/3Kln(2K/δ)/23=O~(T2/3K1/3).\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}RegretT3T2/33Kln(2K/δ)/2=O~(T2/3K1/3).","enumerator":"3.15","key":"MqxhpcwHrH"},{"type":"paragraph","position":{"start":{"line":449,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"text","value":"The ETC algorithm is rather “abrupt” in that it switches from\nexploration to exploitation after a fixed number of timesteps. 
In\npractice, it’s often better to use a more gradual transition, which\nbrings us to the ","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"yonGcvWZ5A"},{"type":"emphasis","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"children":[{"type":"text","value":"epsilon-greedy","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"BKS7SR8j2l"}],"key":"WYHcFQq2MP"},{"type":"text","value":" algorithm.","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"eUZUpGXJFy"}],"key":"P2vYT0I0Bj"}],"key":"cjBeXopxsP"},{"type":"block","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":456,"column":1},"end":{"line":456,"column":1}},"children":[{"type":"text","value":"Epsilon-greedy","position":{"start":{"line":456,"column":1},"end":{"line":456,"column":1}},"key":"ji8XNPkVJU"}],"identifier":"epsilon-greedy","label":"Epsilon-greedy","html_id":"epsilon-greedy","implicit":true,"enumerator":"3.5","key":"NLvCUx9LtI"},{"type":"paragraph","position":{"start":{"line":458,"column":1},"end":{"line":462,"column":1}},"children":[{"type":"text","value":"Instead of doing all of the exploration and then all of the exploitation\nseparately – which additionally requires knowing the time horizon\nbeforehand – we can instead interleave exploration and exploitation by,\nat each timestep, choosing a random action with some probability. We\ncall this the ","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"tqSmwAsEAc"},{"type":"strong","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"children":[{"type":"text","value":"epsilon-greedy","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"wJTMvhdOq9"}],"key":"WdKUkLjhIj"},{"type":"text","value":" algorithm.","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"WjXgw7FJ3C"}],"key":"SuAKAjb3ZB"}],"key":"S3Ng9bz0aa"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class EpsilonGreedy(Agent):\n def __init__(\n self,\n K: int,\n T: int,\n ε_array: Float[Array, \" T\"],\n ):\n super().__init__(K, T)\n self.ε_array = ε_array\n\n def choose_arm(self):\n return solutions.epsilon_greedy_choose_arm(self)","key":"x5ULJq8IZg"},{"type":"output","id":"DDgEyKtxzNkrVJwR4bLkY","data":[],"key":"aNEiLPnumS"}],"data":{},"key":"sEaNrmuOPP"},{"type":"block","children":[],"key":"J2L3LTuMjT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"Zsne9effF7"},{"type":"output","id":"ifd9Tm1uOL39NkNTliiN6","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"6ad1018e4c18668300eb6bbe80bdc84f","path":"/build/6ad1018e4c18668300eb6bbe80bdc84f.png"}}}],"key":"tnmncr89k2"}],"data":{},"key":"eQxevAZP4A"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that we let ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"hogdLhI4W7"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"zhOf537OVW"},{"type":"text","value":" vary over time. In particular, we might want to gradually ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"YAQ0O39pTp"},{"type":"emphasis","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"decrease","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"NRCcrWwqys"}],"key":"InOXzFmY4I"},{"type":"text","value":" ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"n4uXxssn8N"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"HxFa0y7X2k"},{"type":"text","value":" as we learn more about the reward distributions and no longer need to spend time exploring.","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"DrBTDeA5Ig"}],"key":"BpEOOygwXG"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"ruk9aoyi6Q"}],"key":"ndxjkKiCsq"},{"type":"paragraph","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"What is the expected regret of the algorithm if we set ","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"iS1qgE4jzx"},{"type":"text","value":"ε","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"NrhlBNORpE"},{"type":"text","value":" to be a constant?","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"a7OogPimfw"}],"key":"MjZ5AgC2Ju"}],"key":"jcp6nacDlz"},{"type":"paragraph","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"children":[{"type":"text","value":"It turns out that setting ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"Cn4w0N2irW"},{"type":"inlineMath","value":"\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"ϵt=Kln(t)/t3\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}ϵt=3Kln(t)/t","key":"S0baTq2yeK"},{"type":"text","value":" also achieves a regret of ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"dFaTXlN8AJ"},{"type":"inlineMath","value":"\\tilde O(t^{2/3} K^{1/3})","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"O~(t2/3K1/3)\\tilde O(t^{2/3} K^{1/3})O~(t2/3K1/3)","key":"g4Ur928bg7"},{"type":"text","value":" (ignoring the logarithmic factors). (We will not prove this here.) 
TODO ADD PROOF CITATION","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"eUVlBfTcSf"}],"key":"YIqL4M6Jo8"},{"type":"paragraph","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"In ETC, we had to set ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"lPlaIuNwsP"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"iYPPxnpxjE"},{"type":"text","value":" based on the total number of timesteps ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"lzOs9b3DLl"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"BmJBjJDvOR"},{"type":"text","value":". But the epsilon-greedy algorithm actually handles the exploration ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"LohbUKVxzr"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"automatically","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"PnFKzgc9YC"}],"key":"xXwvBZ300j"},{"type":"text","value":": the regret rate holds for ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"ytw6yr8Z4P"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"uvpO4DdwPp"}],"key":"argQq60ENl"},{"type":"text","value":" ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"YkYR7OMr2D"},{"type":"inlineMath","value":"t","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"ttt","key":"xNBBZrR1mG"},{"type":"text","value":", and doesn’t depend on the final horizon ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"ZwvCfLWM4f"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"UK0pNL5826"},{"type":"text","value":".","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"BlnQ8B0M95"}],"key":"PndiD3ygdz"},{"type":"paragraph","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"But the way these algorithms explore is rather naive: we’ve been exploring ","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"aUjskVDCmK"},{"type":"emphasis","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"uniformly","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"JlbuK0IpZB"}],"key":"YCcPkCcLLn"},{"type":"text","value":" across all the arms. 
But what if we could be smarter about it, and explore ","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"QkfNWXBRi3"},{"type":"emphasis","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"more","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"j1HlEu0hOZ"}],"key":"glEi9Iqeps"},{"type":"text","value":" for arms that we’re less certain about?","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"XM73HMF8Fp"}],"key":"pRsdDeMzuX"}],"key":"ZIml5tYJiI"},{"type":"block","position":{"start":{"line":497,"column":1},"end":{"line":497,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"Upper Confidence Bound (UCB)","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"r0DdkjfUzw"}],"label":"ucb","identifier":"ucb","html_id":"ucb","enumerator":"3.6","key":"xX0xm17eu8"},{"type":"paragraph","position":{"start":{"line":502,"column":1},"end":{"line":506,"column":1}},"children":[{"type":"text","value":"To quantify how ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"AhSGwQ18iC"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"certain","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"HjcYr7fNJ8"}],"key":"EDetClK3Ui"},{"type":"text","value":" we are about the mean of each arm, we’ll\ncompute ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"OEPqktHlYI"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"confidence intervals","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"Ji5F0WczcD"}],"key":"LiRtyHQbDd"},{"type":"text","value":" for our estimators, and then choose the\narm with the highest ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"CatXuG8nzI"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"UzPVy9GBRw"}],"key":"LPNZce77OL"},{"type":"text","value":". This operates on the\nprinciple of ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"t5MRFSz92i"},{"type":"strong","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"the benefit of the doubt (i.e. 
optimism in the face of\nuncertainty)","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"jaFOmKCl2U"}],"key":"SNiwWSfsbs"},{"type":"text","value":": we’ll choose the arm that we’re most optimistic about.","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"U4aON9bDtZ"}],"key":"K8J9jVIPae"},{"type":"paragraph","position":{"start":{"line":508,"column":1},"end":{"line":511,"column":1}},"children":[{"type":"text","value":"In particular, for each arm ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"lNdVzWCuES"},{"type":"inlineMath","value":"k","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"kkk","key":"WYfaE8DpSv"},{"type":"text","value":" at time ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"phnNjHBhDA"},{"type":"inlineMath","value":"t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"ttt","key":"chr0KhiPaW"},{"type":"text","value":", we’d like to compute some\nupper confidence bound ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"UMaoDrUotO"},{"type":"inlineMath","value":"M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"MtkM^k_tMtk","key":"Q9AKptXrvF"},{"type":"text","value":" such that ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"AFdEGxT0OV"},{"type":"inlineMath","value":"\\hat \\mu^k_t \\le M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"μ^tkMtk\\hat \\mu^k_t \\le M^k_tμ^tkMtk","key":"ts5k7Hc8pp"},{"type":"text","value":" with\nhigh probability, and then choose ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"NlhYTK5lEr"},{"type":"inlineMath","value":"a_t := \\arg \\max_{k \\in [K]} M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"at:=argmaxk[K]Mtka_t := \\arg \\max_{k \\in [K]} M^k_tat:=argmaxk[K]Mtk","key":"ctnV6SJMpS"},{"type":"text","value":".\nBut how should we compute ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"Up4aa7O8Uh"},{"type":"inlineMath","value":"M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"MtkM^k_tMtk","key":"O4GMlOEd5p"},{"type":"text","value":"?","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"PMR7MDY91v"}],"key":"DJzTAGiC9i"},{"type":"paragraph","position":{"start":{"line":513,"column":1},"end":{"line":519,"column":1}},"children":[{"type":"text","value":"In ","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"wFQMfTCu48"},{"type":"crossReference","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"Section ","key":"jsAVbowuqP"},{"type":"text","value":"3.4.1","key":"TF5KNouHJd"}],"identifier":"etc-regret-analysis","label":"etc-regret-analysis","kind":"heading","template":"Section %s","enumerator":"3.4.1","resolved":true,"html_id":"etc-regret-analysis","key":"CbdKZnf06t"},{"type":"text","value":", we were able to compute this bound\nusing Hoeffding’s inequality, which assumes that the number of samples\nis 
","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"w6nbaKNFLJ"},{"type":"emphasis","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"fixed","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"XdvFmJtEXs"}],"key":"j3ZSYyz26c"},{"type":"text","value":". This was the case in ETC (where we pull each arm\n","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"uA19jJe8JT"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"aKMA68hoQn"},{"type":"text","value":" times), but in UCB, the number of times we pull\neach arm depends on the agent’s actions, which in turn depend on the\nrandom rewards and are therefore stochastic. So we ","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"WWo3SpiTFx"},{"type":"emphasis","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"can’t","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"X3rxeonKB3"}],"key":"l4iCImMmnj"},{"type":"text","value":" use\nHoeffding’s inequality directly.","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"TD2aPjz168"}],"key":"MlOQy339GN"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"Instead, we’ll apply the same trick we used in the ETC analysis: we’ll\nuse the ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"ov1xPRhphh"},{"type":"strong","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"union bound","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"xoWGLvbJ9P"}],"key":"ymTB12J70F"},{"type":"text","value":" to compute a ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"iR3dkEDFrk"},{"type":"emphasis","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"looser","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"BCJmpQkiEj"}],"key":"axaxABl4HY"},{"type":"text","value":" bound that holds\n","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"hcZMIbR3pE"},{"type":"emphasis","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"uniformly","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"iDLisPFrn8"}],"key":"UsEKe8vsKh"},{"type":"text","value":" across all timesteps and arms. 
Let’s introduce some notation\nto discuss this.","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"DVtUVKgdE7"}],"key":"ibAlrWItpu"},{"type":"paragraph","position":{"start":{"line":526,"column":1},"end":{"line":528,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"IoehmXWf4r"},{"type":"inlineMath","value":"N^k_t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"NtkN^k_tNtk","key":"ZJQloC55ea"},{"type":"text","value":" denote the (random) number of times arm ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"Iz629Dr7dX"},{"type":"inlineMath","value":"k","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"kkk","key":"QWXmRssCJJ"},{"type":"text","value":" has been pulled\nwithin the first ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"L7GZ5Lj5lh"},{"type":"inlineMath","value":"t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"ttt","key":"mimNbsOyHf"},{"type":"text","value":" timesteps, and ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"r1c8eGiVkO"},{"type":"inlineMath","value":"\\hat \\mu^k_t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"μ^tk\\hat \\mu^k_tμ^tk","key":"A6DpWgkrtB"},{"type":"text","value":" denote the sample\naverage of those pulls. That is,","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"aTI6AX3nen"}],"key":"yBDMG739Lp"},{"type":"math","value":"\\begin{aligned}\n N^k_t &:= \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} \\\\\n \\hat \\mu^k_t &:= \\frac{1}{N^k_t} \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} r_\\tau.\n\\end{aligned}","position":{"start":{"line":530,"column":1},"end":{"line":535,"column":1}},"html":"Ntk:=τ=0t11{aτ=k}μ^tk:=1Ntkτ=0t11{aτ=k}rτ.\\begin{aligned}\n N^k_t &:= \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} \\\\\n \\hat \\mu^k_t &:= \\frac{1}{N^k_t} \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} r_\\tau.\n\\end{aligned}Ntkμ^tk:=τ=0t11{aτ=k}:=Ntk1τ=0t11{aτ=k}rτ.","enumerator":"3.16","key":"aXBnJAEni9"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"To achieve the “fixed sample size” assumption, we’ll\nneed to shift our index from ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"bUMM8MQdiP"},{"type":"emphasis","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"time","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"EHpH6DsdRn"}],"key":"QTjTnzagE2"},{"type":"text","value":" to ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"NLWopHvGOq"},{"type":"emphasis","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"number of samples from each\narm","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"w2FRaaW1NJ"}],"key":"mLbmcxgNqx"},{"type":"text","value":". 
In particular, we’ll define ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"DvpDP4Sg8T"},{"type":"inlineMath","value":"\\tilde r^k_n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"r~nk\\tilde r^k_nr~nk","key":"gr3eUkRsGi"},{"type":"text","value":" to be the ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"adncT6JZib"},{"type":"inlineMath","value":"n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"nnn","key":"jRIhgqkMsr"},{"type":"text","value":"th sample\nfrom arm ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"jfeEx2qOon"},{"type":"inlineMath","value":"k","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"kkk","key":"xW0RrkqLxO"},{"type":"text","value":", and ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"jiSzFunFKj"},{"type":"inlineMath","value":"\\tilde \\mu^k_n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"μ~nk\\tilde \\mu^k_nμ~nk","key":"hPIewl35y8"},{"type":"text","value":" to be the sample average of the first\n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"Uot9QmVQE0"},{"type":"inlineMath","value":"n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"nnn","key":"mxH95JHyOJ"},{"type":"text","value":" samples from arm ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"FWLZTFpe6V"},{"type":"inlineMath","value":"k","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"kkk","key":"VaEFxgdYP9"},{"type":"text","value":". Then, for a fixed ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"Bjz2aXp7Mb"},{"type":"inlineMath","value":"n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"nnn","key":"oRNZi4Ja4m"},{"type":"text","value":", this satisfies the\n“fixed sample size” assumption, and we can apply Hoeffding’s inequality\nto get a bound on ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"jnAelZeGx8"},{"type":"inlineMath","value":"\\tilde \\mu^k_n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"μ~nk\\tilde \\mu^k_nμ~nk","key":"wwWjy5CWRn"},{"type":"text","value":".","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"Gn1WJc9Wad"}],"key":"j1Mhd6Sc9r"},{"type":"paragraph","position":{"start":{"line":545,"column":1},"end":{"line":549,"column":1}},"children":[{"type":"text","value":"So how can we extend our bound on ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"zlkTNGOwjP"},{"type":"inlineMath","value":"\\tilde\\mu^k_n","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"μ~nk\\tilde\\mu^k_nμ~nk","key":"L50XHD3HhB"},{"type":"text","value":" to ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"HbzEerdYQU"},{"type":"inlineMath","value":"\\hat \\mu^k_t","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"μ^tk\\hat \\mu^k_tμ^tk","key":"kMuYfWOH3B"},{"type":"text","value":"?\nWell, we know ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"Ek0rRSERmc"},{"type":"inlineMath","value":"N^k_t \\le 
t","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"NtktN^k_t \\le tNtkt","key":"ud2OBePCva"},{"type":"text","value":" (where equality would be the case if and\nonly if we had pulled arm ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"j14HVASvZk"},{"type":"inlineMath","value":"k","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"kkk","key":"zQIMhtzDpo"},{"type":"text","value":" every time). So we can apply the same\ntrick as last time, where we uniform-ize across all possible values of\n","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"HKHbKA0mOn"},{"type":"inlineMath","value":"N^k_t","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"NtkN^k_tNtk","key":"mL18EMihmj"},{"type":"text","value":":","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"C3NOw41Nr9"}],"key":"FWAqNSR8zE"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( \\forall n \\le t, |\\tilde \\mu^k_n - \\mu^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) &\\ge 1-t\\delta.\n\\end{aligned}","position":{"start":{"line":551,"column":1},"end":{"line":555,"column":1}},"html":"P(nt,μ~nkμkln(2/δ)2n)1tδ.\\begin{aligned}\n \\pr\\left( \\forall n \\le t, |\\tilde \\mu^k_n - \\mu^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) &\\ge 1-t\\delta.\n\\end{aligned}P(nt,μ~nkμk2nln(2/δ))1tδ.","enumerator":"3.17","key":"JRTFvHbIli"},{"type":"paragraph","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"children":[{"type":"text","value":"In particular, since ","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"key":"aXJAnjZ1bo"},{"type":"inlineMath","value":"N^k_t \\le t","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"html":"NtktN^k_t \\le tNtkt","key":"MLNWFcRAGL"},{"type":"text","value":", and ","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"key":"FbwE3WNdzb"},{"type":"inlineMath","value":"\\tilde \\mu^k_{N^k_t} = \\hat \\mu^k_t","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"html":"μ~Ntkk=μ^tk\\tilde \\mu^k_{N^k_t} = \\hat \\mu^k_tμ~Ntkk=μ^tk","key":"g5g9IyYfqY"},{"type":"text","value":" by definition, we have","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"key":"J4vvttexPQ"}],"key":"Um40Bett6m"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( |\\hat \\mu^k_t - \\mu^k | \\le \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}} \\right) &\\ge 1-\\delta' \\text{ where } \\delta' := t \\delta.\n\\end{aligned}","position":{"start":{"line":559,"column":1},"end":{"line":563,"column":1}},"html":"P(μ^tkμkln(2t/δ)2Ntk)1δ where δ:=tδ.\\begin{aligned}\n \\pr\\left( |\\hat \\mu^k_t - \\mu^k | \\le \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}} \\right) &\\ge 1-\\delta' \\text{ where } \\delta' := t \\delta.\n\\end{aligned}P(μ^tkμk2Ntkln(2t/δ))1δ where δ:=tδ.","enumerator":"3.18","key":"yIu9pbdgtU"},{"type":"paragraph","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"children":[{"type":"text","value":"This bound would then suffice for applying the UCB algorithm! 
That is, the upper confidence bound for arm ","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"arh9bTpp9T"},{"type":"inlineMath","value":"k","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"html":"kkk","key":"dpTWoMsoWT"},{"type":"text","value":" would be","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"g2W5q4hAWR"}],"key":"yUqOmBIaDE"},{"type":"math","value":"M^k_t := \\hat \\mu^k_t + \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}},","position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"html":"Mtk:=μ^tk+ln(2t/δ)2Ntk,M^k_t := \\hat \\mu^k_t + \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}},Mtk:=μ^tk+2Ntkln(2t/δ),","enumerator":"3.19","key":"R8q46q6x42"},{"type":"paragraph","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"children":[{"type":"text","value":"where we can choose ","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"eNlRNNat8O"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"html":"δ\\delta'δ","key":"gFYyFeSQzI"},{"type":"text","value":" depending on how tight we want the interval to be.","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"b13McSYybJ"}],"key":"BvOfd9S9FP"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":571,"column":1},"end":{"line":573,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"children":[{"type":"text","value":"A smaller ","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"key":"OsSvOIVovV"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"html":"δ\\delta'δ","key":"a4JBSNkrXM"},{"type":"text","value":" would give us a larger and higher-confidence interval, emphasizing the exploration term.","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"key":"aKZLX30ElS"}],"key":"uxKA1FgsDq"},{"type":"listItem","spread":true,"position":{"start":{"line":572,"column":1},"end":{"line":573,"column":1}},"children":[{"type":"text","value":"A larger ","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"LFLTF5MPB1"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"html":"δ\\delta'δ","key":"xdq8MtMFyw"},{"type":"text","value":" would give a tighter and lower-confidence interval, prioritizing the current sample averages.","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"PUA2whP84u"}],"key":"CnBGxM08cc"}],"key":"YI88sdkL7Y"},{"type":"paragraph","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"text","value":"We can now use this to define the UCB algorithm.","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"prb8NjBWcP"}],"key":"pzUsu7vyAW"}],"key":"sR1wGw0OHQ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class UCB(Agent):\n def __init__(self, K: int, T: int, delta: float):\n super().__init__(K, T)\n self.delta = delta\n\n def choose_arm(self):\n return 
solutions.ucb_choose_arm(self)","key":"rZr12wdggi"},{"type":"output","id":"Vv9Dm7q7gUYsUp42k28tc","data":[],"key":"wz9KouEF0A"}],"data":{},"key":"kkRoZrSQix"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"Intuitively, UCB prioritizes arms where:","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"key":"NKK07S6UmB"}],"key":"OV0hmR05HW"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":588,"column":1},"end":{"line":593,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":588,"column":1},"end":{"line":590,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":588,"column":1},"end":{"line":589,"column":1}},"children":[{"type":"inlineMath","value":"\\hat \\mu^k_t","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"html":"μ^tk\\hat \\mu^k_tμ^tk","key":"wJmMOjc35f"},{"type":"text","value":" is large, i.e. the arm has a high sample average, and\nwe’d choose it for ","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"key":"EJYM0oAaCj"},{"type":"emphasis","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"children":[{"type":"text","value":"exploitation","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"key":"y3t0MGv1TZ"}],"key":"BlYrrNNMds"},{"type":"text","value":", and","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"key":"DgFW8yYXja"}],"key":"g6LXeqM8nw"}],"key":"MaECBgVNbP"},{"type":"listItem","spread":true,"position":{"start":{"line":591,"column":1},"end":{"line":593,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":592,"column":1}},"children":[{"type":"inlineMath","value":"\\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}}","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"html":"ln(2t/δ)2Ntk\\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}}2Ntkln(2t/δ)","key":"jKujB7IOjb"},{"type":"text","value":" is large, i.e. we’re still\nuncertain about the arm, and we’d choose it for ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"nEJPV3z6jh"},{"type":"emphasis","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"exploration","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"hfnmiewzcW"}],"key":"JKYNpHV4II"},{"type":"text","value":".","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"ru0NxFuJpp"}],"key":"V62Qd93gN2"}],"key":"YlT1QMAtPq"}],"key":"rXaKitMDgR"},{"type":"paragraph","position":{"start":{"line":594,"column":1},"end":{"line":595,"column":1}},"children":[{"type":"text","value":"As desired, this explores in a smarter, ","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"key":"BABPPN6h96"},{"type":"emphasis","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"children":[{"type":"text","value":"adaptive","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"key":"RlfTbM8oyR"}],"key":"fsy0iAiCwM"},{"type":"text","value":" way compared to the\nprevious algorithms. 
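The `solutions` module is kept hidden, so here is a minimal sketch of what `ucb_choose_arm` might look like. It assumes the agent records its history in arrays `agent.choices` and `agent.rewards` (these attribute names are our own assumption, not necessarily the book's `Agent` interface):

```python
import numpy as np

def ucb_choose_arm(agent) -> int:
    choices = np.asarray(agent.choices)  # assumed: arms pulled so far
    rewards = np.asarray(agent.rewards)  # assumed: rewards received so far
    t = len(choices)
    ucbs = np.zeros(agent.K)
    for k in range(agent.K):
        n = int((choices == k).sum())  # N^k_t
        if n == 0:
            return k  # pull every arm once before trusting any interval
        sample_mean = rewards[choices == k].mean()  # hat mu^k_t
        bonus = np.sqrt(np.log(2 * t / agent.delta) / (2 * n))
        ucbs[k] = sample_mean + bonus  # M^k_t from (3.19), with delta' = agent.delta
    return int(np.argmax(ucbs))
```

The early `return k` handles the first pass through the arms, where $N^k_t = 0$ and the confidence width in (3.19) is undefined.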
Does it achieve lower regret?","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"key":"bQZmQkukgE"}],"key":"VsnmPxL8Ht"}],"key":"GXdaavM7j9"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = UCB(mab.K, mab.T, 0.9)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"Ay7ffZM6Bz"},{"type":"output","id":"WW8pnPLr2L2aLvDsSyh4V","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"f3eb002ad30c5ba869f3a828d502f4d2","path":"/build/f3eb002ad30c5ba869f3a828d502f4d2.png"}}}],"key":"Or8RQmjxQ0"}],"data":{},"key":"kHesWaFtJB"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"children":[{"type":"text","value":"UCB regret analysis","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"epCw3J1LyG"}],"identifier":"ucb-regret-analysis","label":"UCB regret analysis","html_id":"ucb-regret-analysis","implicit":true,"enumerator":"3.6.1","key":"u3y8qdaZbj"},{"type":"paragraph","position":{"start":{"line":605,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"First we’ll bound the regret incurred at each timestep. Then we’ll bound\nthe ","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"RbjSO83f2x"},{"type":"emphasis","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"children":[{"type":"text","value":"total","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"LxrW557NjS"}],"key":"jqCtzAikE9"},{"type":"text","value":" regret across timesteps.","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"gL0mMMCgdU"}],"key":"bIBUIghxNN"},{"type":"paragraph","position":{"start":{"line":608,"column":1},"end":{"line":611,"column":1}},"children":[{"type":"text","value":"For the sake of analysis, we’ll use a slightly looser bound that applies\nacross the whole time horizon and across all arms. We’ll omit the\nderivation since it’s very similar to the above (walk through it\nyourself for practice).","position":{"start":{"line":608,"column":1},"end":{"line":608,"column":1}},"key":"Ggxn0veYys"}],"key":"LSnhXptF0z"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left(\\forall k \\le K, t < T. |\\hat \\mu^k_t - \\mu^k | \\le B^k_t \\right) &\\ge 1-\\delta'' \\\\\n \\text{where} \\quad B^k_t &:= \\sqrt{\\frac{\\ln(2TK/\\delta'')}{2N^k_t}}.\n\\end{aligned}","position":{"start":{"line":613,"column":1},"end":{"line":618,"column":1}},"html":"P(kK,t<T.μ^tkμkBtk)1δwhereBtk:=ln(2TK/δ)2Ntk.\\begin{aligned}\n \\pr\\left(\\forall k \\le K, t < T. 
|\\hat \\mu^k_t - \\mu^k | \\le B^k_t \\right) &\\ge 1-\\delta'' \\\\\n \\text{where} \\quad B^k_t &:= \\sqrt{\\frac{\\ln(2TK/\\delta'')}{2N^k_t}}.\n\\end{aligned}P(kK,t<T.∣μ^tkμkBtk)whereBtk1δ′′:=2Ntkln(2TK/δ′′).","enumerator":"3.20","key":"cCQnPxI53W"},{"type":"paragraph","position":{"start":{"line":620,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"Intuitively, ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"xBQ45KXocd"},{"type":"inlineMath","value":"B^k_t","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"BtkB^k_tBtk","key":"U6gmKNouGu"},{"type":"text","value":" denotes the ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"P3wogBoKiB"},{"type":"emphasis","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"children":[{"type":"text","value":"width","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"YtjpW62Hc2"}],"key":"QaIRvsNJiY"},{"type":"text","value":" of the CI for arm ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"jHuBitN69l"},{"type":"inlineMath","value":"k","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"kkk","key":"CZOGh3s1u8"},{"type":"text","value":" at time\n","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"TRFYBb0dQl"},{"type":"inlineMath","value":"t","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"ttt","key":"RABQu9ZT98"},{"type":"text","value":". Then, assuming the above uniform bound holds (which occurs with\nprobability ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"WXI8AO09tc"},{"type":"inlineMath","value":"1-\\delta''","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"1δ1-\\delta''1δ′′","key":"mFBXTDiOEI"},{"type":"text","value":"), we can bound the regret at each timestep as\nfollows:","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"skrGutYSV7"}],"key":"Hz2FIZgMk6"},{"type":"math","value":"\\begin{aligned}\n \\mu^\\star - \\mu^{a_t} &\\le \\hat \\mu^{k^*}_t + B_t^{k^*} - \\mu^{a_t} && \\text{applying UCB to arm } k^\\star \\\\\n &\\le \\hat \\mu^{a_t}_t + B^{a_t}_t - \\mu^{a_t} && \\text{since UCB chooses } a_t = \\arg \\max_{k \\in [K]} \\hat \\mu^k_t + B_t^{k} \\\\\n &\\le 2 B^{a_t}_t && \\text{since } \\hat \\mu^{a_t}_t - \\mu^{a_t} \\le B^{a_t}_t \\text{ by definition of } B^{a_t}_t \\\\\n\\end{aligned}","position":{"start":{"line":625,"column":1},"end":{"line":631,"column":1}},"html":"μμatμ^tk+Btkμatapplying UCB to arm kμ^tat+Btatμatsince UCB chooses at=argmaxk[K]μ^tk+Btk2Btatsince μ^tatμatBtat by definition of Btat\\begin{aligned}\n \\mu^\\star - \\mu^{a_t} &\\le \\hat \\mu^{k^*}_t + B_t^{k^*} - \\mu^{a_t} && \\text{applying UCB to arm } k^\\star \\\\\n &\\le \\hat \\mu^{a_t}_t + B^{a_t}_t - \\mu^{a_t} && \\text{since UCB chooses } a_t = \\arg \\max_{k \\in [K]} \\hat \\mu^k_t + B_t^{k} \\\\\n &\\le 2 B^{a_t}_t && \\text{since } \\hat \\mu^{a_t}_t - \\mu^{a_t} \\le B^{a_t}_t \\text{ by definition of } B^{a_t}_t \\\\\n\\end{aligned}μμatμ^tk+Btkμatμ^tat+Btatμat2Btatapplying UCB to arm ksince UCB chooses at=argk[K]maxμ^tk+Btksince μ^tatμatBtat by definition of 
Btat","enumerator":"3.21","key":"necYujI7SF"},{"type":"paragraph","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"Summing this across timesteps gives","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"muAiS7h7e0"}],"key":"HH6Owtt9vw"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le \\sum_{t=0}^{T-1} 2 B^{a_t}_t \\\\\n &= \\sqrt{2\\ln(2TK/\\delta'')} \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\\\\n \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \\sum_{t=0}^{T-1} \\sum_{k=1}^K \\mathbf{1}\\{ a_t = k \\} (N^k_t)^{-1/2} \\\\\n &= \\sum_{k=1}^K \\sum_{n=1}^{N_T^k} n^{-1/2} \\\\\n &\\le K \\sum_{n=1}^T n^{-1/2} \\\\\n \\sum_{n=1}^T n^{-1/2} &\\le 1 + \\int_1^T x^{-1/2} \\ \\mathrm{d}x \\\\\n &= 1 + (2 \\sqrt{x})_1^T \\\\\n &= 2 \\sqrt{T} - 1 \\\\\n &\\le 2 \\sqrt{T} \\\\\n\\end{aligned}","position":{"start":{"line":635,"column":1},"end":{"line":647,"column":1}},"html":"RegretTt=0T12Btat=2ln(2TK/δ)t=0T1(Ntat)1/2t=0T1(Ntat)1/2=t=0T1k=1K1{at=k}(Ntk)1/2=k=1Kn=1NTkn1/2Kn=1Tn1/2n=1Tn1/21+1Tx1/2 dx=1+(2x)1T=2T12T\\begin{aligned}\n \\text{Regret}_T &\\le \\sum_{t=0}^{T-1} 2 B^{a_t}_t \\\\\n &= \\sqrt{2\\ln(2TK/\\delta'')} \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\\\\n \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \\sum_{t=0}^{T-1} \\sum_{k=1}^K \\mathbf{1}\\{ a_t = k \\} (N^k_t)^{-1/2} \\\\\n &= \\sum_{k=1}^K \\sum_{n=1}^{N_T^k} n^{-1/2} \\\\\n &\\le K \\sum_{n=1}^T n^{-1/2} \\\\\n \\sum_{n=1}^T n^{-1/2} &\\le 1 + \\int_1^T x^{-1/2} \\ \\mathrm{d}x \\\\\n &= 1 + (2 \\sqrt{x})_1^T \\\\\n &= 2 \\sqrt{T} - 1 \\\\\n &\\le 2 \\sqrt{T} \\\\\n\\end{aligned}RegretTt=0T1(Ntat)1/2n=1Tn1/2t=0T12Btat=2ln(2TK/δ′′)t=0T1(Ntat)1/2=t=0T1k=1K1{at=k}(Ntk)1/2=k=1Kn=1NTkn1/2Kn=1Tn1/21+1Tx1/2 dx=1+(2x)1T=2T12T","enumerator":"3.22","key":"A7oHL4Ev6N"},{"type":"paragraph","position":{"start":{"line":649,"column":1},"end":{"line":649,"column":1}},"children":[{"type":"text","value":"Putting everything together gives","position":{"start":{"line":649,"column":1},"end":{"line":649,"column":1}},"key":"wNXyRXjqWj"}],"key":"Enza5yroBV"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le 2 K \\sqrt{2T \\ln(2TK/\\delta'')} && \\text{with probability } 1-\\delta'' \\\\\n &= \\tilde O(K\\sqrt{T})\n\\end{aligned}","position":{"start":{"line":651,"column":1},"end":{"line":656,"column":1}},"html":"RegretT2K2Tln(2TK/δ)with probability 1δ=O~(KT)\\begin{aligned}\n \\text{Regret}_T &\\le 2 K \\sqrt{2T \\ln(2TK/\\delta'')} && \\text{with probability } 1-\\delta'' \\\\\n &= \\tilde O(K\\sqrt{T})\n\\end{aligned}RegretT2K2Tln(2TK/δ′′)=O~(KT)with probability 1δ′′","enumerator":"3.23","key":"fsRyNmGJ2B"},{"type":"paragraph","position":{"start":{"line":658,"column":1},"end":{"line":659,"column":1}},"children":[{"type":"text","value":"In fact, we can do a more sophisticated analysis to trim off a factor of ","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"qgjm1B0TTi"},{"type":"inlineMath","value":"\\sqrt{K}","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"html":"K\\sqrt{K}K","key":"gUQP0aoMA7"},{"type":"text","value":"\nand show ","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"Yy59mnOzoP"},{"type":"inlineMath","value":"\\text{Regret}_T = \\tilde O(\\sqrt{TK})","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"html":"RegretT=O~(TK)\\text{Regret}_T = \\tilde 
O(\\sqrt{TK})RegretT=O~(TK)","key":"uDJteSLsxY"},{"type":"text","value":".","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"nN2zuqL5jb"}],"key":"tc4P66xaLC"}],"key":"ShmQkiqzZA"},{"type":"block","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"children":[{"type":"text","value":"Lower bound on regret (intuition)","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"Tj5cU1oU7F"}],"identifier":"lower-bound-on-regret-intuition","label":"Lower bound on regret (intuition)","html_id":"lower-bound-on-regret-intuition","implicit":true,"enumerator":"3.6.2","key":"vkLxpqMUhI"},{"type":"paragraph","position":{"start":{"line":665,"column":1},"end":{"line":668,"column":1}},"children":[{"type":"text","value":"Is it possible to do better than ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"BVf2MrsXLO"},{"type":"inlineMath","value":"\\Omega(\\sqrt{T})","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"html":"Ω(T)\\Omega(\\sqrt{T})Ω(T)","key":"MlJqW1U4b5"},{"type":"text","value":" in general? In fact,\nno! We can show that any algorithm must incur ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"yJWMTiIIn0"},{"type":"inlineMath","value":"\\Omega(\\sqrt{T})","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"html":"Ω(T)\\Omega(\\sqrt{T})Ω(T)","key":"PKLVe4B1zG"},{"type":"text","value":" regret\nin the worst case. We won’t rigorously prove this here, but the\nintuition is as follows.","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"pjrzcQv69A"}],"key":"KQlAFaHZm2"},{"type":"paragraph","position":{"start":{"line":670,"column":1},"end":{"line":674,"column":1}},"children":[{"type":"text","value":"The Central Limit Theorem tells us that with ","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"Ogiog7LZsN"},{"type":"inlineMath","value":"T","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"html":"TTT","key":"MuKrXAfSjC"},{"type":"text","value":" i.i.d. samples from\nsome distribution, we can only learn the mean of the distribution to\nwithin ","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"TcGGQcQAK0"},{"type":"inlineMath","value":"\\Omega(1/\\sqrt{T})","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"html":"Ω(1/T)\\Omega(1/\\sqrt{T})Ω(1/T)","key":"z3vbB7aCOI"},{"type":"text","value":" (the standard deviation). 
Then, since we get\n","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"MYVzliz7II"},{"type":"inlineMath","value":"T","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"html":"TTT","key":"B731zi4yNE"},{"type":"text","value":" samples spread out across the arms, we can only learn each arm’s\nmean to an even looser degree.","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"AyQM7YSrCF"}],"key":"Otjga3IBkc"},{"type":"paragraph","position":{"start":{"line":676,"column":1},"end":{"line":679,"column":1}},"children":[{"type":"text","value":"That is, if two arms have means that are within about ","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"key":"kt8ZHMVGqU"},{"type":"inlineMath","value":"1/\\sqrt{T}","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"html":"1/T1/\\sqrt{T}1/T","key":"haocAAZuVp"},{"type":"text","value":", we\nwon’t be able to confidently tell them apart, and will sample them about\nequally. But then we’ll incur regret","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"key":"KY3CZwSb4j"}],"key":"OupBkwFtl3"},{"type":"math","value":"\\Omega((T/2) \\cdot (1/\\sqrt{T})) = \\Omega(\\sqrt{T}).","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"tight":"before","html":"Ω((T/2)(1/T))=Ω(T).\\Omega((T/2) \\cdot (1/\\sqrt{T})) = \\Omega(\\sqrt{T}).Ω((T/2)(1/T))=Ω(T).","enumerator":"3.24","key":"J8capJKmNr"}],"key":"P5nZnmqQpu"},{"type":"block","position":{"start":{"line":681,"column":1},"end":{"line":681,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"children":[{"type":"text","value":"Thompson sampling and Bayesian bandits","position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"key":"cG8cLhUGfG"}],"label":"thompson_sampling","identifier":"thompson_sampling","html_id":"thompson-sampling","enumerator":"3.7","key":"oWvpy5k3vl"},{"type":"paragraph","position":{"start":{"line":686,"column":1},"end":{"line":692,"column":1}},"children":[{"type":"text","value":"So far, we’ve treated the parameters ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"W1TfdNLIMR"},{"type":"inlineMath","value":"\\mu^0, \\dots, \\mu^{K-1}","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"html":"μ0,,μK1\\mu^0, \\dots, \\mu^{K-1}μ0,,μK1","key":"cAcLO945uG"},{"type":"text","value":" of the\nreward distributions as ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"g7hGOmPG26"},{"type":"emphasis","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"fixed","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"UXz8iDCv24"}],"key":"oXWLTRvkwP"},{"type":"text","value":". 
Instead, we can take a ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"Y8x8AarEQG"},{"type":"strong","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"Bayesian","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"aE4n6zyFTF"}],"key":"WHkrTwMsPr"},{"type":"text","value":"\napproach where we treat them as random variables from some ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"IOoZl7gvjm"},{"type":"strong","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"prior\ndistribution","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"SW1P1243Bl"}],"key":"HCFgysQscw"},{"type":"text","value":". Then, upon pulling an arm and observing a reward, we can\nsimply ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"Va8JosprND"},{"type":"emphasis","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"condition","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"ZdzH9TSBy7"}],"key":"UEOWhFguOu"},{"type":"text","value":" on this observation to exactly describe the\n","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"GEbnDudFf9"},{"type":"strong","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"posterior distribution","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"uuf8o8XTfa"}],"key":"DHltcaMttb"},{"type":"text","value":" over the parameters. This fully describes the\ninformation we gain about the parameters from observing the reward.","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"xm441X3DxN"}],"key":"cUTpYZYuQX"},{"type":"paragraph","position":{"start":{"line":694,"column":1},"end":{"line":696,"column":1}},"children":[{"type":"text","value":"From this Bayesian perspective, the ","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"key":"I3lp0xrBME"},{"type":"strong","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"children":[{"type":"text","value":"Thompson sampling","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"key":"iWqwJODQPd"}],"key":"aAZYPofDoi"},{"type":"text","value":" algorithm\nfollows naturally: just sample from the distribution of the optimal arm,\ngiven the observations!","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"key":"nooPK9qepY"}],"key":"JpkIcwtmbp"}],"key":"Uo9MB9Mh2C"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Distribution:\n def sample(self) -> Float[Array, \" K\"]:\n \"\"\"Sample a vector of means for the K arms.\"\"\"\n ...\n\n def update(self, arm: int, reward: float):\n \"\"\"Condition on obtaining `reward` from the given arm.\"\"\"\n ...","key":"qCO9iYA4xg"},{"type":"output","id":"wqvS5akPKxoiBV7KCTnco","data":[],"key":"bf46Q29zBH"}],"data":{},"key":"Ptd0Et9M2o"},{"type":"block","children":[],"key":"Tc653fEZxF"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class ThompsonSampling(Agent):\n def __init__(self, K: int, T: int, prior: Distribution):\n super().__init__(K, T)\n self.distribution = prior\n\n def 
choose_arm(self):\n means = self.distribution.sample()\n return random_argmax(means)\n\n def update_history(self, arm: int, reward: int):\n super().update_history(arm, reward)\n self.distribution.update(arm, reward)","key":"PDgeBlFrSJ"},{"type":"output","id":"l2mRR_hDd2d0aVkD6L5WV","data":[],"key":"Bc3gaoruJV"}],"data":{},"key":"njaVLDJXqK"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":724,"column":1},"end":{"line":729,"column":1}},"children":[{"type":"text","value":"In other words, we sample each arm proportionally to how likely we think\nit is to be optimal, given the observations so far. This strikes a good\nexploration-exploitation tradeoff: we explore more for arms that we’re\nless certain about, and exploit more for arms that we’re more certain\nabout. Thompson sampling is a simple yet powerful algorithm that\nachieves state-of-the-art performance in many settings.","position":{"start":{"line":724,"column":1},"end":{"line":724,"column":1}},"key":"R9ocISkxjc"}],"key":"vqYbsIPRC1"},{"type":"proof","kind":"example","label":"bayesian_bernoulli","identifier":"bayesian_bernoulli","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bayesian Bernoulli bandit","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"YkZ01aGjfU"}],"key":"hYdO6C1qfQ"},{"type":"paragraph","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"children":[{"type":"text","value":"We’ve been working in the Bernoulli bandit setting, where arm ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"oQoxKn66ce"},{"type":"inlineMath","value":"k","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"html":"kkk","key":"Uo6PdacyRc"},{"type":"text","value":" yields a reward of ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"Jxf0xNYJik"},{"type":"text","value":"1","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"p1xx9Ac440"},{"type":"text","value":" with probability ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"i67MItBmrW"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"html":"μk\\mu^kμk","key":"fV8kKMyYuV"},{"type":"text","value":" and no reward otherwise. 
The vector of success probabilities ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"SVBiMsWDRu"},{"type":"inlineMath","value":"\\boldsymbol{\\mu} = (\\mu^1, \\dots, \\mu^K)","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"html":"μ=(μ1,,μK)\\boldsymbol{\\mu} = (\\mu^1, \\dots, \\mu^K)μ=(μ1,,μK)","key":"gd8S1Nvb5T"},{"type":"text","value":" thus describes the entire MAB.","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"GuO5R6t7kt"}],"key":"l8zM6gGH3D"},{"type":"paragraph","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"children":[{"type":"text","value":"Under the Bayesian perspective, we think of ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"qlQmg0EaSu"},{"type":"inlineMath","value":"\\boldsymbol{\\mu}","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"html":"μ\\boldsymbol{\\mu}μ","key":"JkOzNmOBEV"},{"type":"text","value":" as a ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"CDreYJlOqf"},{"type":"emphasis","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"children":[{"type":"text","value":"random","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"zuZhP7DMlp"}],"key":"SY1R5ZDEkf"},{"type":"text","value":" vector drawn from some prior distribution ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"jLbSV7lL9T"},{"type":"inlineMath","value":"\\pi(\\boldsymbol{\\mu})","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"html":"π(μ)\\pi(\\boldsymbol{\\mu})π(μ)","key":"xP3ALPESC3"},{"type":"text","value":". For example, we might have ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"tlnF8x6Ez7"},{"type":"text","value":"π","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"umtEwInooj"},{"type":"text","value":" be the Uniform distribution over the unit hypercube ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"HqVjPjz2dP"},{"type":"inlineMath","value":"[0, 1]^K","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"html":"[0,1]K[0, 1]^K[0,1]K","key":"IZxgnn3bBe"},{"type":"text","value":", that is,","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"ySdGfjtOXS"}],"key":"CHR7SUmqsS"},{"type":"math","value":"\\pi(\\boldsymbol{\\mu}) = \\begin{cases}\n 1 & \\text{if } \\boldsymbol{\\mu}\\in [0, 1]^K \\\\\n 0 & \\text{otherwise}\n\\end{cases}","position":{"start":{"line":738,"column":1},"end":{"line":741,"column":1}},"html":"π(μ)={1if μ[0,1]K0otherwise\\pi(\\boldsymbol{\\mu}) = \\begin{cases}\n 1 & \\text{if } \\boldsymbol{\\mu}\\in [0, 1]^K \\\\\n 0 & \\text{otherwise}\n\\end{cases}π(μ)={10if μ[0,1]Kotherwise","enumerator":"3.25","key":"BJa7scEClH"},{"type":"paragraph","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"children":[{"type":"text","value":"In this case, upon viewing some reward, we can exactly calculate the ","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"ZvCEqPN7pH"},{"type":"strong","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"children":[{"type":"text","value":"posterior","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"NniOBJjZK5"}],"key":"euxbJenAie"},{"type":"text","value":" 
distribution of ","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"bPfzpJFlSh"},{"type":"inlineMath","value":"\\boldsymbol{\\mu}","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"html":"μ\\boldsymbol{\\mu}μ","key":"nzMWjRFKvn"},{"type":"text","value":" using Bayes’s rule (i.e. the definition of conditional probability):","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"vvQVO7mDbg"}],"key":"ZHBAQi3Sqj"},{"type":"math","value":"\\begin{aligned}\n \\pr(\\boldsymbol{\\mu} \\mid a_0, r_0) &\\propto \\pr(r_0 \\mid a_0, \\boldsymbol{\\mu}) \\pr(a_0 \\mid \\boldsymbol{\\mu}) \\pr(\\boldsymbol{\\mu}) \\\\\n &\\propto (\\mu^{a_0})^{r_0} (1 - \\mu^{a_0})^{1-r_0}.\n\\end{aligned}","position":{"start":{"line":745,"column":1},"end":{"line":750,"column":1}},"html":"P(μa0,r0)P(r0a0,μ)P(a0μ)P(μ)(μa0)r0(1μa0)1r0.\\begin{aligned}\n \\pr(\\boldsymbol{\\mu} \\mid a_0, r_0) &\\propto \\pr(r_0 \\mid a_0, \\boldsymbol{\\mu}) \\pr(a_0 \\mid \\boldsymbol{\\mu}) \\pr(\\boldsymbol{\\mu}) \\\\\n &\\propto (\\mu^{a_0})^{r_0} (1 - \\mu^{a_0})^{1-r_0}.\n\\end{aligned}P(μa0,r0)P(r0a0,μ)P(a0μ)P(μ)(μa0)r0(1μa0)1r0.","enumerator":"3.26","key":"hwliDAaImW"},{"type":"paragraph","position":{"start":{"line":752,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"This is the PDF of the\n","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"WDef7zH5VV"},{"type":"inlineMath","value":"\\text{Beta}(1 + r_0, 1 + (1 - r_0))","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"Beta(1+r0,1+(1r0))\\text{Beta}(1 + r_0, 1 + (1 - r_0))Beta(1+r0,1+(1r0))","key":"umMSzzoIyo"},{"type":"text","value":" distribution, which is a conjugate\nprior for the Bernoulli distribution. That is, if we start with a Beta\nprior on ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"EacZotcZvr"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"μk\\mu^kμk","key":"OScIFR9MCQ"},{"type":"text","value":" (note that ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"b3PfTTGmpG"},{"type":"inlineMath","value":"\\text{Unif}([0, 1]) = \\text{Beta}(1, 1)","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"Unif([0,1])=Beta(1,1)\\text{Unif}([0, 1]) = \\text{Beta}(1, 1)Unif([0,1])=Beta(1,1)","key":"uWPQsLBVvM"},{"type":"text","value":"),\nthen the posterior, after conditioning on samples from\n","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"Aq2USTDWxt"},{"type":"inlineMath","value":"\\text{Bern}(\\mu^k)","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"Bern(μk)\\text{Bern}(\\mu^k)Bern(μk)","key":"llOp1ai016"},{"type":"text","value":", will also be Beta. 
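Concretely, if the prior on $\mu^k$ is $\text{Beta}(\alpha, \beta)$ and we observe reward $r \in \{0, 1\}$ from arm $k$, then

$$
p(\mu^k \mid r) \propto \underbrace{(\mu^k)^{r} (1 - \mu^k)^{1 - r}}_{\text{likelihood}} \cdot \underbrace{(\mu^k)^{\alpha - 1} (1 - \mu^k)^{\beta - 1}}_{\text{prior}} = (\mu^k)^{(\alpha + r) - 1} (1 - \mu^k)^{(\beta + 1 - r) - 1},
$$

which is proportional to the $\text{Beta}(\alpha + r, \beta + 1 - r)$ density. This one-line update is exactly what the `update` method of the `Beta` class below implements.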
This is a very convenient\nproperty, since it means we can simply update the parameters of the Beta\ndistribution upon observing a reward, rather than having to recompute\nthe entire posterior distribution from scratch.","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"q74kcWbI98"}],"key":"nWNw7DFCO1"}],"enumerator":"3.3","html_id":"bayesian-bernoulli","key":"Vd16fzQmKR"}],"key":"dkjrEK3lVO"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Beta(Distribution):\n def __init__(self, K: int, alpha: int = 1, beta: int = 1):\n self.alphas = np.full(K, alpha)\n self.betas = np.full(K, beta)\n\n def sample(self):\n return np.random.beta(self.alphas, self.betas)\n\n def update(self, arm: int, reward: int):\n self.alphas[arm] += reward\n self.betas[arm] += 1 - reward","key":"s63oI4x0ui"},{"type":"output","id":"1PmMmTR6hQDnJio5aw7ut","data":[],"key":"bTTKJObJls"}],"data":{},"key":"qVAuo8qsUI"},{"type":"block","children":[],"key":"hd1NBd7wgk"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"beta_distribution = Beta(mab.K)\nagent = ThompsonSampling(mab.K, mab.T, beta_distribution)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"xq98l3hVuk"},{"type":"output","id":"HGfdm3plOpBnSO530LoPb","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"991419959ab213822fb1c34db8883adb","path":"/build/991419959ab213822fb1c34db8883adb.png"}}}],"key":"a29HhM9n8K"}],"data":{},"key":"AKhyoq64A7"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"It turns out that asymptotically, Thompson sampling is optimal in the\nfollowing sense. ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"foUHGuvXsJ"},{"type":"cite","kind":"narrative","label":"lai_asymptotically_1985","identifier":"lai_asymptotically_1985","children":[{"type":"text","value":"Lai & Robbins (1985)","key":"uInCl56ItK"}],"enumerator":"2","key":"UYtgLULFqW"},{"type":"text","value":" prove an\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"MIO2MAjKPM"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"instance-dependent","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"WNfPaXusDp"}],"key":"KLAxTId0pj"},{"type":"text","value":" lower bound that says for ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"QGvjqhfeho"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"B5L9alCxhz"}],"key":"Dc22SkIrcS"},{"type":"text","value":" bandit algorithm,","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"pZJfMIobZn"}],"key":"lcpFRXf1nO"},{"type":"math","value":"\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"lim infTE[NTk]ln(T)1KL(μkμ)\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}Tliminfln(T)E[NTk]KL(μkμ)1","enumerator":"3.27","key":"xMUVppssr5"},{"type":"paragraph","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"key":"sOBlHwpS7q"}],"key":"bpg9yswn47"},{"type":"math","value":"\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}","position":{"start":{"line":792,"column":1},"end":{"line":792,"column":1}},"html":"KL(μkμ)=μklnμkμ+(1μk)ln1μk1μ\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}KL(μkμ)=μklnμμk+(1μk)ln1μ1μk","enumerator":"3.28","key":"DhrS8PFGYD"},{"type":"paragraph","position":{"start":{"line":794,"column":1},"end":{"line":798,"column":1}},"children":[{"type":"text","value":"measures the ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"zSKRzUc6Ho"},{"type":"strong","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"Kullback-Leibler divergence","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"Cc8ePqw2Sy"}],"key":"ac95ZhcwyS"},{"type":"text","value":" from the Bernoulli\ndistribution with mean 
","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"LEhFKkiEUV"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μk\\mu^kμk","key":"jnXP9Vx2OD"},{"type":"text","value":" to the Bernoulli distribution with mean\n","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"NWwPMEJfrD"},{"type":"inlineMath","value":"\\mu^\\star","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μ\\mu^\\starμ","key":"Gp9rDDHdA7"},{"type":"text","value":". It turns out that Thompson sampling achieves this lower\nbound with equality! That is, not only is the error ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"Gp4CqBuDgu"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"rate","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"reZjZNIp13"}],"key":"OcQiTjRXbS"},{"type":"text","value":" optimal, but\nthe ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"JEdXCM5AUV"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"constant factor","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"T0OwhBNmaT"}],"key":"RlzPJoFoJi"},{"type":"text","value":" is optimal as well.","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"VnTmBmWDL2"}],"key":"SFgAnv7YC4"}],"key":"FGUzP6T2EO"},{"type":"block","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"Contextual bandits","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"lJzt0mXggJ"}],"identifier":"contextual-bandits","label":"Contextual bandits","html_id":"contextual-bandits","implicit":true,"enumerator":"3.8","key":"e9LV0cjy5O"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"Xthvt7g9jC"}],"key":"MhiVZRDEpx"},{"type":"paragraph","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"children":[{"type":"text","value":"This content is advanced material taught at the end of the course.","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"key":"sYT68wQzZe"}],"key":"lA0uV3nhcA"}],"key":"WkfA2QFEPT"},{"type":"paragraph","position":{"start":{"line":808,"column":1},"end":{"line":814,"column":1}},"children":[{"type":"text","value":"In the above MAB environment, the reward distributions of the arms\nremain constant. However, in many real-world settings, we might receive\nadditional information that affects these distributions. For example, in\nthe online advertising case where each arm corresponds to an ad we could\nshow the user, we might receive information about the user’s preferences\nthat changes how likely they are to click on a given ad. 
We can model\nsuch environments using ","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"kXsMuYKuM2"},{"type":"strong","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"children":[{"type":"text","value":"contextual bandits","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"BAzL5z5tJ0"}],"key":"xlMDDcE6CB"},{"type":"text","value":".","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"M2T8cZEKKY"}],"key":"JgRKJX3gvL"},{"type":"proof","kind":"definition","label":"contextual_bandit","identifier":"contextual_bandit","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Contextual bandit","position":{"start":{"line":816,"column":1},"end":{"line":816,"column":1}},"key":"LwptsEeM7C"}],"key":"tM6PW1kmXU"},{"type":"paragraph","position":{"start":{"line":819,"column":1},"end":{"line":824,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"bPafTm7FTY"},{"type":"inlineMath","value":"t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"ttt","key":"uaSCMZH63Q"},{"type":"text","value":", a new ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"jUIRnWdWEv"},{"type":"emphasis","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"children":[{"type":"text","value":"context","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"UM6olXeLGe"}],"key":"eJ69BBWYkS"},{"type":"text","value":"\n","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"uRutDObvDf"},{"type":"inlineMath","value":"x_t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"xtx_txt","key":"whrzxKYp1s"},{"type":"text","value":" is drawn from some distribution ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"RXyVII9dk6"},{"type":"inlineMath","value":"\\nu_{\\text{x}}","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"νx\\nu_{\\text{x}}νx","key":"NnLaqdmygz"},{"type":"text","value":". The learner gets\nto observe the context, and choose an action ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"FYs8IdqzBt"},{"type":"inlineMath","value":"a_t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"ata_tat","key":"tFjQZgaDTe"},{"type":"text","value":" according to some\ncontext-dependent policy ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"b9Z49FmpZS"},{"type":"inlineMath","value":"\\pi_t(x_t)","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"πt(xt)\\pi_t(x_t)πt(xt)","key":"sP8UdVElnn"},{"type":"text","value":". Then, the learner observes the\nreward from the chosen arm ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"zSl8oLICro"},{"type":"inlineMath","value":"r_t \\sim \\nu^{a_t}(x_t)","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"rtνat(xt)r_t \\sim \\nu^{a_t}(x_t)rtνat(xt)","key":"uPyxfseIGY"},{"type":"text","value":". 
The reward\ndistribution also depends on the context.","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"hLwXuhM51i"}],"key":"lpw802SR6J"}],"enumerator":"3.2","html_id":"contextual-bandit","key":"AE1nKi4eWZ"}],"key":"h2YJFxwif9"},{"type":"block","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":829,"column":1},"end":{"line":831,"column":1}},"children":[{"type":"text","value":"Assuming our context is ","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"M4PpaTPVKv"},{"type":"emphasis","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"children":[{"type":"text","value":"discrete","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"ikOXlLHsdT"}],"key":"ssZZP7WlTA"},{"type":"text","value":", we can just perform the same\nalgorithms, treating each context-arm pair as its own arm. This gives us\nan enlarged MAB of ","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"EF1zSkMvkP"},{"type":"inlineMath","value":"K |\\mathcal{X}|","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"html":"KXK |\\mathcal{X}|KX","key":"jtvW5Uj4cm"},{"type":"text","value":" arms.","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"DST73qnaPW"}],"key":"tAXRp9Hnlx"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"Vn95InZN3f"}],"key":"DQYKTsnyrQ"},{"type":"paragraph","position":{"start":{"line":834,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"Write down the UCB algorithm for this enlarged MAB. That is, write an\nexpression for ","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"bUPHh9hb2H"},{"type":"inlineMath","value":"\\pi_t(x_t) = \\arg\\max_a \\dots","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"html":"πt(xt)=argmaxa\\pi_t(x_t) = \\arg\\max_a \\dotsπt(xt)=argmaxa","key":"d6jrDfnNt2"},{"type":"text","value":".","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"tcKHz6ZprY"}],"key":"hpzv4ThQZV"}],"key":"yBLTOGTMSn"},{"type":"paragraph","position":{"start":{"line":838,"column":1},"end":{"line":844,"column":1}},"children":[{"type":"text","value":"Recall that running UCB for ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"RlnaTOLlWU"},{"type":"inlineMath","value":"T","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"TTT","key":"z1kxMlLDO0"},{"type":"text","value":" timesteps on an MAB with ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"WEkKgufg1u"},{"type":"inlineMath","value":"K","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"KKK","key":"mOmi9vtVLW"},{"type":"text","value":" arms\nachieves a regret bound of ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"CmOyqWInBJ"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{TK})","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"O~(TK)\\tilde{O}(\\sqrt{TK})O~(TK)","key":"kF0GSK3Iv4"},{"type":"text","value":". 
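To make the "enlarged MAB" construction concrete, here is a minimal sketch (illustrative only, assuming contexts are labeled $0, \dots, |\mathcal{X}| - 1$):

```python
def flat_arm(context: int, arm: int, K: int) -> int:
    """Index of the (context, arm) pair among the K * |X| arms of the enlarged MAB."""
    return context * K + arm
```

Any of the algorithms above can then be run unchanged over the $K |\mathcal{X}|$ flattened arms, with $\pi_t(x_t)$ reading off the slice of arms belonging to context $x_t$.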
So in this problem, we would achieve regret $\tilde{O}(\sqrt{TK|\mathcal{X}|})$ in the contextual MAB, which has a polynomial dependence on $|\mathcal{X}|$. But in a situation with a large, or even infinite, number of contexts, e.g. when the context is a continuous value, this becomes intractable.

Note that this "enlarged MAB" treats the different contexts as entirely unrelated to each other, while in practice, contexts are often *related* to each other in some way: for example, we might want to advertise similar products to users with similar preferences. How can we incorporate this structure into our solution?

(lin_ucb)=
### Linear contextual bandits

We want to model the *mean reward* of arm $k$ as a function of the context, i.e. $\mu^k(x)$. One simple model is the *linear* one: $\mu^k(x) = x^\top \theta^k$, where $x \in \mathcal{X} = \mathbb{R}^d$ and $\theta^k \in \mathbb{R}^d$ describes a *feature direction* for arm $k$. Recall that **supervised learning** gives us a way to estimate a conditional expectation from samples: we learn a *least squares* estimator from the timesteps where arm $k$ was selected:

$$\hat \theta_t^k = \arg\min_{\theta \in \mathbb{R}^d} \sum_{\{ i \in [t] : a_i = k \}} (r_i - x_i^\top \theta)^2.$$

This has the closed-form solution known as the *ordinary least squares* (OLS) estimator:

```{math}
:label: ols_bandit
\begin{aligned}
    \hat \theta_t^k & = (A_t^k)^{-1} \sum_{\{ i \in [t] : a_i = k \}} x_i r_i \\
    \text{where} \quad A_t^k & = \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top.
\end{aligned}
```
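Concretely, here is a minimal NumPy sketch of this per-arm computation (the function and array names are ours, for illustration; `X` stacks the contexts $x_i$ and `r` the rewards $r_i$ from the timesteps where arm $k$ was selected):

```python
import numpy as np

def ols_estimate(X: np.ndarray, r: np.ndarray) -> np.ndarray:
    """OLS estimate of theta^k from contexts X (n, d) and rewards r (n,)."""
    A = X.T @ X                   # A_t^k = sum_i x_i x_i^T
    b = X.T @ r                   # sum_i x_i r_i
    # assumes A is invertible; see the regularization note below
    return np.linalg.solve(A, b)
```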
","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"jZIJOEZsWk"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploration","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"aGWBVXnpi5"}],"key":"Wy6ThxSGs5"},{"type":"text","value":" of new arms and ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"eFjAORzs5F"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploitation","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"vOReg9XrVb"}],"key":"wWe3CWvljY"},{"type":"text","value":" of arms that we\nbelieve to have high reward. But how should we construct the upper\nconfidence bound? Previously, we treated the pulls of an arm as i.i.d.\nsamples and used Hoeffding’s inequality to bound the distance of the\nsample mean, our estimator, from the true mean. However, now our\nestimator is not a sample mean, but rather the OLS estimator above ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"Dh7F9Jc32a"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"LCOtBmyjUF"},{"type":"text","value":"3.30","key":"S4lS9iAAtg"},{"type":"text","value":")","key":"dbDh0is9MZ"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"ANkIn4K9rR"},{"type":"text","value":". Instead, we’ll use ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"iAuonfjPje"},{"type":"strong","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"Chebyshev’s\ninequality","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"n0XgWHGKRR"}],"key":"jv4iRFVLdT"},{"type":"text","value":" to construct an upper confidence bound.","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"u1VqdumhE8"}],"key":"wRL74C3LJ6"},{"type":"proof","kind":"theorem","label":"chebyshev","identifier":"chebyshev","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Chebyshev’s inequality","position":{"start":{"line":886,"column":1},"end":{"line":886,"column":1}},"key":"VBNwDKJLbS"}],"key":"UTv9ihzauh"},{"type":"paragraph","position":{"start":{"line":889,"column":1},"end":{"line":891,"column":1}},"children":[{"type":"text","value":"For a random variable ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"Qpazz15kr8"},{"type":"inlineMath","value":"Y","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"YYY","key":"vumxns3IK8"},{"type":"text","value":" such that\n","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"ySVv7PD6uM"},{"type":"inlineMath","value":"\\E Y = 0","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY=0\\E Y = 0EY=0","key":"WOrOK9ZgMU"},{"type":"text","value":" and ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"YyHLKOTH1i"},{"type":"inlineMath","value":"\\E Y^2 = \\sigma^2","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY2=σ2\\E Y^2 = 
\\sigma^2EY2=σ2","key":"btXYLoKaDG"},{"type":"text","value":",","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"TOtZ1MOilc"}],"key":"EDoy2kp3tJ"},{"type":"math","value":"|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"tight":"before","html":"Yβσwith probability11β2|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}Yβσwith probability1β21","enumerator":"3.31","key":"Uo4yJJFNVo"}],"enumerator":"3.3","html_id":"chebyshev","key":"ptlq6L9ySn"},{"type":"paragraph","position":{"start":{"line":894,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"Since the OLS estimator is known to be unbiased (try proving this\nyourself), we can apply Chebyshev’s inequality to\n","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"kapI4qpWxT"},{"type":"inlineMath","value":"x_t^\\top (\\hat \\theta_t^k - \\theta^k)","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"html":"xt(θ^tkθk)x_t^\\top (\\hat \\theta_t^k - \\theta^k)xt(θ^tkθk)","key":"T8T5LHXUJ8"},{"type":"text","value":":","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"wGZtG4CYz1"}],"key":"xkvq6eRzf9"},{"type":"math","value":"\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":900,"column":1}},"html":"xtθkxtθ^tk+βxt(Atk)1xtwith probability11β2\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}xtθkxtθ^tk+βxt(Atk)1xtwith probability1β21","enumerator":"3.32","key":"NHKzsY2KCm"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"srTkGGrq7z"}],"key":"ntrnkwjaKX"},{"type":"paragraph","position":{"start":{"line":903,"column":1},"end":{"line":905,"column":1}},"children":[{"type":"text","value":"We haven’t explained why ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"p92h3nQ0eU"},{"type":"inlineMath","value":"x_t^\\top (A_t^k)^{-1} x_t","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xt(Atk)1xtx_t^\\top (A_t^k)^{-1} x_txt(Atk)1xt","key":"TCLC0WUOyJ"},{"type":"text","value":" is the correct\nexpression for the variance of ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"OhhxElnrzs"},{"type":"inlineMath","value":"x_t^\\top \\hat \\theta_t^k","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xtθ^tkx_t^\\top \\hat \\theta_t^kxtθ^tk","key":"bjyxBbMfT3"},{"type":"text","value":". 
This result\nfollows from some algebra on the definition of the OLS estimator ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"vTeNkaEFp6"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"YhLK6i0l24"},{"type":"text","value":"3.30","key":"UsFN0lJgr0"},{"type":"text","value":")","key":"iiWBD4mJBO"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"MC8Ne1BpgG"},{"type":"text","value":".","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"Zs2vxGECZ7"}],"key":"cdFP8etNXC"}],"key":"lOCFbMwec8"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"The first term is exactly our predicted reward ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"JRXtbpOktn"},{"type":"inlineMath","value":"\\hat \\mu^k_t(x_t)","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"μ^tk(xt)\\hat \\mu^k_t(x_t)μ^tk(xt)","key":"DD3yskHvU3"},{"type":"text","value":". To\ninterpret the second term, note that","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"Gxo7770hbF"}],"key":"R0fL6bj7JZ"},{"type":"math","value":"x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"xt(Atk)1xt=1Ntkxt(Σtk)1xt,x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,xt(Atk)1xt=Ntk1xt(Σtk)1xt,","enumerator":"3.33","key":"EboiwS09Ua"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"p46G01VkwQ"}],"key":"ee68TE6hXm"},{"type":"math","value":"\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"Σtk=1Ntk{i[t]:ai=k}xixi\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\topΣtk=Ntk1{i[t]:ai=k}xixi","enumerator":"3.34","key":"n9xEpDed9b"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"is the empirical covariance matrix of the contexts (assuming that the\ncontext has mean zero). 
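To see this exploration bonus at work, here is a small illustrative sketch (variable names ours) showing that $\sqrt{x^\top A^{-1} x}$ shrinks as more contexts are observed:

```python
import numpy as np

rng = np.random.default_rng(0)
d = 3
x = rng.normal(size=d)

for n in [1, 10, 100]:
    X = rng.normal(size=(n, d))
    A_n = np.eye(d) + X.T @ X   # lambda * I (lambda = 1) plus observed contexts
    print(n, np.sqrt(x @ np.linalg.solve(A_n, x)))  # bonus decreases with n
```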
We can now substitute these quantities into UCB to get the **LinUCB** algorithm:

```{code-cell} python
class LinUCBPseudocode(Agent):
    def __init__(
        self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float]
    ):
        super().__init__(K, T)
        self.lam = lam
        self.get_c = get_c
        # one ridge-regularized Gram matrix A^k = lam * I + sum_i x_i x_i^T per arm
        self.A = np.repeat((lam * np.eye(D))[np.newaxis, ...], K, axis=0)
        # per-arm targets sum_i x_i r_i and weight estimates theta^k
        self.targets = np.zeros((K, D))
        self.w = np.zeros((K, D))

    def choose_arm(self, context: Float[Array, " D"]):
        c = self.get_c(self.count)
        # exploration bonus sqrt(x^T (A^k)^{-1} x) for each arm
        bonus = np.array([context @ np.linalg.solve(A_k, context) for A_k in self.A])
        scores = self.w @ context + c * np.sqrt(bonus)
        return random_argmax(scores)

    def update_history(self, context: Float[Array, " D"], arm: int, reward: float):
        self.A[arm] += np.outer(context, context)
        self.targets[arm] += context * reward
        self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm])
```
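As a minimal usage sketch for the pseudocode above (assuming the `Agent` base class, `random_argmax`, `np`, `Callable`, and the `Float`/`Array` type aliases from earlier in these notes; the synthetic linear environment below is made up for illustration):

```python
import numpy as np

K, T, D = 3, 1000, 5
agent = LinUCBPseudocode(K, T, D, lam=1.0, get_c=lambda t: 1.0 + np.sqrt(np.log(t + 1)))

rng = np.random.default_rng(184)
theta_true = rng.normal(size=(K, D))  # hypothetical true feature directions

for _ in range(T):
    context = rng.normal(size=D)
    arm = agent.choose_arm(context)
    # linear mean reward plus Gaussian noise
    reward = context @ theta_true[arm] + rng.normal()
    agent.update_history(context, arm, reward)
```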
","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"bXjsA7i185"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"E4vtYMvYUZ"},{"type":"text","value":" above might not be invertible. When does this occur? One way to address this is to include a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"QAJ6RaMswc"},{"type":"inlineMath","value":"\\lambda I","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"λI\\lambda IλI","key":"J2cizETQ0J"},{"type":"text","value":" regularization term to ensure that ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"okSZgvBPQX"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"mDLA0gHBXo"},{"type":"text","value":" is invertible. This is equivalent to solving a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"PDeLcaI375"},{"type":"emphasis","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"ridge regression","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"o7My5osfmn"}],"key":"YXb3QqJK4J"},{"type":"text","value":" problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"yz3rgrCCWw"}],"key":"E8FrPUh7gl"}],"key":"zjPW5CLmND"}],"key":"Zb4Ga5IDJJ"},{"type":"block","position":{"start":{"line":951,"column":1},"end":{"line":951,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":953,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"inlineMath","value":"c_t","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"ctc_tct","key":"WquyfIPyJc"},{"type":"text","value":" is similar to the ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"AkJeWgn6Uv"},{"type":"inlineMath","value":"\\log (2t/\\delta')","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"log(2t/δ)\\log (2t/\\delta')log(2t/δ)","key":"wuohpaqCul"},{"type":"text","value":" term of UCB: It controls the\nwidth of the confidence interval. 
Here, we treat it as a tunable\nparameter, though in a theoretical analysis, it would depend on ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"AxGhoLBwTb"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"AtkA_t^kAtk","key":"C9t27xhFN6"},{"type":"text","value":"\nand the probability ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"IWfCuGYcta"},{"type":"text","value":"δ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"QeSHtoLiBx"},{"type":"text","value":" with which the bound holds.","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"zrb25ANtD9"}],"key":"wnj7ViXpSm"},{"type":"paragraph","position":{"start":{"line":958,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Using similar tools for UCB, we can also prove an ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"cl6NB8zuCT"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{T})","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"html":"O~(T)\\tilde{O}(\\sqrt{T})O~(T)","key":"vWekOLjtiP"},{"type":"text","value":"\nregret bound. The full details of the analysis can be found in Section 3 of ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"HqB4I5mNzj"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"ZX1hNQCndA"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"TwAs0Ox69Y"}],"key":"TE1KamsOjP"},{"type":"text","value":" (2022)","key":"lx0vbXC6lm"}],"enumerator":"3","key":"zbQ4Ck40zp"},{"type":"text","value":".","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"jBo4gHK5yN"}],"key":"kNMI29Lkd9"},{"type":"heading","depth":2,"position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"key":"XsmHUm3rEW"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"3.9","key":"nGSZbnfj42"},{"type":"paragraph","position":{"start":{"line":963,"column":1},"end":{"line":964,"column":1}},"children":[{"type":"text","value":"In this chapter,\nwe explored the ","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"SEwmDvuTlE"},{"type":"strong","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"children":[{"type":"text","value":"multi-armed bandit","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"J5DViuH2YD"}],"key":"OkeTSZ3q4b"},{"type":"text","value":" setting for analyzing sequential decision-making in an unknown environment.","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"qOwc71TLeN"}],"key":"OVSSmOHPRx"}],"key":"DGLY7twmMf"}],"key":"fqhCm5iabt"},"references":{"cite":{"order":["vershynin_high-dimensional_2018","lai_asymptotically_1985","agarwal_reinforcement_2022"],"data":{"vershynin_high-dimensional_2018":{"label":"vershynin_high-dimensional_2018","enumerator":"1","html":"Vershynin, R. (2018). High-Dimensional Probability: An Introduction with Applications in Data Science. 
**References**

1. Vershynin, R. (2018). *High-Dimensional Probability: An Introduction with Applications in Data Science*. Cambridge University Press.
2. Lai, T. L., & Robbins, H. (1985). Asymptotically Efficient Adaptive Allocation Rules. *Advances in Applied Mathematics*, 6(1), 4–22. https://doi.org/10.1016/0196-8858(85)90002-8
3. Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). *Reinforcement Learning: Theory and Algorithms*.
24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?D.createElement("title",{id:t},e):null,D.createElement("path",{d:"M21.8 18c0 1.1-.9 2-1.9 2H4.2c-1.1 0-1.9-.9-1.9-2V9.9c0-.5.3-.7.8-.4l7.8 4.7c.7.4 1.7.4 2.4 0L21 9.5c.4-.2.8-.1.8.4V18z"}),D.createElement("path",{d:"M21.8 6c0-1.1-.9-2-1.9-2H4.2c-1.1 0-2 .9-2 2v.4c0 .5.3 1.1.8 1.3l8.5 5.1c.2.1.7.1.9 0l8.6-5c.4-.3.8-.9.8-1.3-.1-.1-.1-.5 0-.5z"}))}var Ar=D.forwardRef(_r),Pe=Ar;var U=l(b(),1);function Er({title:e,titleId:t,...n},r){return U.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?U.createElement("title",{id:t},e):null,U.createElement("path",{d:"M12 2.5c-5.4 0-9.8 4.4-9.8 9.7 0 4.3 2.8 8 6.7 9.2.5.1.7-.2.7-.5v-1.8c-2.4.5-3.1-.6-3.3-1.1-.1-.3-.6-1.1-1-1.4-.3-.2-.8-.6 0-.6s1.3.7 1.5 1c.9 1.5 2.3 1.1 2.8.8.1-.6.3-1.1.6-1.3-2.2-.2-4.4-1.1-4.4-4.8 0-1.1.4-1.9 1-2.6-.1-.2-.4-1.2.1-2.6 0 0 .8-.3 2.7 1 .8-.2 1.6-.3 2.4-.3.8 0 1.7.1 2.4.3 1.9-1.3 2.7-1 2.7-1 .5 1.3.2 2.3.1 2.6.6.7 1 1.5 1 2.6 0 3.7-2.3 4.6-4.4 4.8.4.3.7.9.7 1.8V21c0 .3.2.6.7.5 3.9-1.3 6.6-4.9 6.6-9.2 0-5.4-4.4-9.8-9.8-9.8z"}))}var Mr=U.forwardRef(Er),Ne=Mr;var G=l(b(),1);function Sr({title:e,titleId:t,...n},r){return G.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?G.createElement("title",{id:t},e):null,G.createElement("path",{d:"M20.2 1.7c0 .8-.5 1.4-1.3 1.5-.8 0-1.4-.5-1.5-1.3 0-.8.5-1.4 1.3-1.5.8-.1 1.5.5 1.5 1.3zM12 17.9c-3.7 0-7-1.3-8.7-3.3 1.8 4.8 7.1 7.3 11.9 5.5 2.5-.9 4.5-2.9 5.5-5.5-1.7 2-4.9 3.3-8.7 3.3zM12 5.1c3.7 0 7 1.3 8.7 3.3-1.8-4.8-7.1-7.3-11.9-5.5-2.5.9-4.5 2.9-5.5 5.5 1.7-2 5-3.3 8.7-3.3zM6.9 21.8c.1 1-.7 1.8-1.7 1.9-1 .1-1.8-.7-1.9-1.7 0-1 .7-1.8 1.7-1.9 1-.1 1.8.7 1.9 1.7zM3.7 4.6c-.6 0-1-.4-1-1s.4-1 1-1 1 .4 1 1c0 .5-.4 1-1 1z"}))}var Ir=G.forwardRef(Sr),ke=Ir;var E=l(b(),1);function Dr({title:e,titleId:t,...n},r){return E.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?E.createElement("title",{id:t},e):null,E.createElement("path",{d:"M17.1 12.6h-2V7.5c0-1.7-1.4-3.1-3-3.1-.8 0-1.6.3-2.2.9-.6.5-.9 1.3-.9 2.2v.7H7v-.7c0-1.4.5-2.7 1.5-3.7s2.2-1.5 3.6-1.5 2.6.5 3.6 1.5 1.5 2.3 1.5 3.7v5.1z"}),E.createElement("path",{d:"M12 21.8c-.8 0-1.6-.2-2.3-.5-.7-.3-1.4-.8-1.9-1.3-.6-.6-1-1.2-1.3-2-.3-.8-.5-1.6-.5-2.4s.2-1.6.5-2.4c.3-.7.7-1.4 1.3-2s1.2-1 1.9-1.3c.7-.3 1.5-.5 2.3-.5.8 0 1.6.2 2.3.5.7.3 1.4.8 1.9 1.3.6.6 1 1.2 1.3 2 .3.8.5 1.6.5 2.4s-.2 1.6-.5 2.4c-.3.7-.7 1.4-1.3 2-.6.6-1.2 1-1.9 1.3-.7.3-1.5.5-2.3.5zm0-10.3c-2.2 0-4 1.8-4 4.1s1.8 4.1 4 4.1 4-1.8 4-4.1-1.8-4.1-4-4.1z"}),E.createElement("circle",{cx:12,cy:15.6,r:1.7}))}var jr=E.forwardRef(Dr),Oe=jr;var K=l(b(),1);function Fr({title:e,titleId:t,...n},r){return K.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?K.createElement("title",{id:t},e):null,K.createElement("path",{d:"M21.8 12c0 5.4-4.4 9.8-9.8 9.8S2.2 17.4 2.2 12 6.6 2.2 12 2.2s9.8 4.4 9.8 9.8zM8.2 5.8c-.4 0-.8.3-.8.8s.3.8.8.8.8-.4.8-.8-.3-.8-.8-.8zm2.3 9.6h1.2v-6h1.8c2.3 0 3.3 1.4 3.3 3s-1.5 3-3.3 3h-3v1.1H9V8.3H7.7v8.2h5.9c3.3 0 4.5-2.2 4.5-4.1s-1.2-4.1-4.3-4.1h-3.2l-.1 7.1z"}))}var Tr=K.forwardRef(Fr),_e=Tr;var W=l(b(),1);function zr({title:e,titleId:t,...n},r){return 
W.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?W.createElement("title",{id:t},e):null,W.createElement("path",{d:"M13.2 15.6c1.4-.5 2.1-1.6 2.1-3.3S13.8 8.9 12 8.9c-1.9 0-3.3 1.6-3.3 3.3 0 1.8.8 3 2.2 3.4l-2.3 5.9c-3.1-.8-6.3-4.6-6.3-9.3 0-5.5 4.3-10 9.7-10s9.8 4.5 9.8 10c0 4.7-3.1 8.5-6.3 9.3l-2.3-5.9z"}))}var Br=W.forwardRef(zr),Ae=Br;var J=l(b(),1);function Lr({title:e,titleId:t,...n},r){return J.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?J.createElement("title",{id:t},e):null,J.createElement("path",{d:"M10 4.2L8.3 6.8 6.6 4.2H10zM17.1 4.2l-1.7 2.6-1.7-2.6h3.4zM6.6 19.8l1.7-2.6 1.7 2.6H6.6zM13.7 19.8l1.7-2.6 1.7 2.6h-3.4zM20.8 12.5c.6-.1 1.1-.4 1.4-.8.3-.4.5-.9.5-1.5 0-.5-.1-.9-.3-1.2-.2-.3-.4-.6-.7-.8-.3-.2-.6-.3-1-.4-.4-.1-.8-.1-1.2-.1h-3.3v2.6c0-.1-.1-.2-.1-.2-.2-.6-.6-1-1-1.4-.4-.4-.9-.7-1.5-.9-.6-.2-1.2-.3-1.9-.3s-1.3.1-1.9.3c-.5.1-1 .4-1.4.8-.3.4-.6.8-.9 1.3 0-.3-.1-.6-.2-.9-.2-.4-.4-.6-.7-.8-.3-.2-.6-.3-1-.4s-.8-.2-1.3-.2H1v8.5h1.9v-3.4h.9l1.8 3.4h2.3l-2.2-3.6c.6-.1 1.1-.4 1.4-.8v-.1.2c0 .7.1 1.3.3 1.8.2.6.6 1 1 1.4.4.4.9.7 1.5.9.6.2 1.2.3 1.9.3s1.3-.1 1.9-.3c.6-.2 1.1-.5 1.5-.9.4-.4.7-.9 1-1.4 0-.1.1-.2.1-.2V16H18v-3.4h.9l1.8 3.4H23l-2.2-3.5zM5.4 10.7c-.1.2-.2.3-.3.3-.2.1-.3.1-.5.1H2.9V9.2h1.7c.2 0 .3.1.5.1.1.1.3.2.3.3.1.1.1.3.1.5.1.3 0 .5-.1.6zm8.8 2.3c-.1.3-.3.6-.5.9-.2.2-.5.4-.8.6-.3.1-.7.2-1.1.2-.4 0-.8-.1-1.1-.2-.3-.1-.6-.3-.8-.6-.2-.2-.4-.5-.5-.9-.1-.3-.2-.7-.2-1.1 0-.4.1-.8.2-1.1s.3-.6.5-.9c.2-.2.5-.4.8-.6.3-.1.7-.2 1.1-.2.4 0 .8.1 1.1.2.3.1.6.3.8.6.2.2.4.5.5.9.1.3.2.7.2 1.1 0 .4 0 .7-.2 1.1zm6.4-2.3c-.1.1-.2.2-.4.3-.2.1-.3.1-.5.1H18V9.2h1.7c.2 0 .3.1.5.1.1.1.3.2.3.3.1.1.1.3.1.5.1.3.1.5 0 .6z"}))}var Hr=J.forwardRef(Lr),Ee=Hr;var Y=l(b(),1);function $r({title:e,titleId:t,...n},r){return Y.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?Y.createElement("title",{id:t},e):null,Y.createElement("path",{d:"M22.7 5.4c-.8.3-1.7.6-2.5.7.9-.5 1.6-1.4 1.9-2.4-.9.5-1.8.9-2.8 1.1-1.7-1.8-4.4-1.9-6.2-.2-1.1 1.1-1.6 2.7-1.3 4.2-3.5-.3-6.8-1.9-9-4.7-.4.7-.6 1.5-.6 2.2 0 1.5.7 2.8 1.9 3.6-.7 0-1.4-.2-2-.5v.1c0 2.1 1.5 3.9 3.5 4.3-.6.2-1.3.2-2 .1.6 1.8 2.2 3 4.1 3-1.6 1.2-3.5 1.9-5.4 1.9-.3 0-.7 0-1-.1 2 1.3 4.3 2 6.7 2 8.1 0 12.5-6.7 12.5-12.5v-.6c.8-.6 1.6-1.3 2.2-2.2"}))}var Vr=Y.forwardRef($r),de=Vr;var w=l(N(),1);var me=l(le(),1);function Ur({license:e,preamble:t="",className:n}){var r;if(!e.id)return null;let o=/^([CBYSAND0-]+)(?:(?:-)([0-9].[0-9]))?$/.exec(e.id);if(!e.CC||!o)return null;let a=`${t}${(r=e.name)!==null&&r!==void 0?r:e.title} (${e.id})`,i=o[1].toUpperCase();return(0,w.jsxs)("a",{href:e.url,target:"_blank",rel:"noopener noreferrer",className:(0,me.default)("opacity-50 hover:opacity-100 text-inherit hover:text-inherit",n),"aria-label":a,children:[(0,w.jsx)(Ce,{width:"1.25rem",height:"1.25rem",className:"inline-block mx-1",title:a}),(i.startsWith("CC0")||i.startsWith("CC-0")||i.includes("ZERO"))&&(0,w.jsx)(ye,{width:"1.25rem",height:"1.25rem",className:"inline-block mr-1",title:"CC0: Work is in the worldwide public domain"}),i.includes("BY")&&(0,w.jsx)(xe,{width:"1.25rem",height:"1.25rem",className:"inline-block mr-1",title:"Credit must be given to the 
creator"}),i.includes("NC")&&(0,w.jsx)(we,{width:"1.25rem",height:"1.25rem",className:"inline-block mr-1",title:"Only noncommercial uses of the work are permitted"}),i.includes("SA")&&(0,w.jsx)(Re,{width:"1.25rem",height:"1.25rem",className:"inline-block mr-1",title:"Adaptations must be shared under the same terms"}),i.includes("ND")&&(0,w.jsx)(be,{width:"1.25rem",height:"1.25rem",className:"inline-block mr-1",title:"No derivatives or adaptations of the work are permitted"})]})}function Me({license:e,preamble:t="",className:n}){var r;if(!e)return null;let o=typeof e=="string"?{name:"",url:"",id:e}:e;return!o||Object.keys(o).length===0?null:o.CC?(0,w.jsx)(Ur,{license:o,preamble:t,className:n}):(0,w.jsxs)("a",{href:o.url||void 0,target:"_blank",rel:"noopener noreferrer",title:`${t}${(r=o.name)!==null&&r!==void 0?r:o.title} (${o.id})`,className:"text-inherit hover:text-inherit",children:[!o.osi&&(0,w.jsx)(tt,{width:"1.25rem",height:"1.25rem",className:(0,me.default)("mx-1 inline-block opacity-60 hover:opacity-100",n)}),o.osi&&(0,w.jsx)(Ae,{width:"1.25rem",height:"1.25rem",className:(0,me.default)("mx-1 inline-block opacity-60 hover:opacity-100 hover:text-[#599F46]",n)})]})}function Bt({license:e,className:t}){return e?typeof e!="string"&&("code"in e||"content"in e)?(0,w.jsxs)(w.Fragment,{children:[(0,w.jsx)(Me,{license:e.content,preamble:"Content License: ",className:t}),(0,w.jsx)(Me,{license:e.code,preamble:"Code License: ",className:t})]}):(0,w.jsx)(Me,{license:e,className:t}):null}var R=l(N(),1);var Ie=l(le(),1),Ht=l(b(),1),Lt=function(e,t,n,r){function o(a){return a instanceof n?a:new n(function(i){i(a)})}return new(n||(n=Promise))(function(a,i){function s(p){try{g(r.next(p))}catch(x){i(x)}}function h(p){try{g(r.throw(p))}catch(x){i(x)}}function g(p){p.done?a(p.value):o(p.value).then(s,h)}g((r=r.apply(e,t||[])).next())})};function Gr(e,t){return Lt(this,void 0,void 0,function*(){let r=yield(yield fetch(e)).blob();return Kr(r,t)})}function Kr(e,t){return Lt(this,void 0,void 0,function*(){if(window.navigator&&window.navigator.msSaveOrOpenBlob)return window.navigator.msSaveOrOpenBlob(e);let n=URL.createObjectURL(e),r=document.createElement("a");return r.href=n,r.download=t,r.style.display="none",r.dispatchEvent(new MouseEvent("click",{bubbles:!0,cancelable:!0,view:window})),setTimeout(()=>{URL.revokeObjectURL(n),r.remove()},100),!0})}var Se="self-center flex-none inline-block mr-3";function Wr({url:e,filename:t,format:n,className:r,title:o,internal:a}){if(!t){let s=a?(0,R.jsx)(et,{width:"1.25rem",height:"1.25rem",className:Se,"aria-hidden":"true"}):(0,R.jsx)(Qe,{width:"1.25rem",height:"1.25rem",className:Se,"aria-hidden":"true"});return(0,R.jsxs)("a",{className:(0,Ie.default)(r,"flex no-underline"),href:e,target:a?void 0:"_blank",rel:a?void 0:"noreferrer noopener",children:[(0,R.jsxs)("span",{className:"sr-only",children:["Visit URL ",o!=null?o:""]}),s,(0,R.jsx)("span",{className:"w-max max-w-[200px] self-center",children:o!=null?o:e})]})}let i=(0,Ht.useCallback)(s=>{s.preventDefault(),Gr(e,t)},[e,t]);return(0,R.jsxs)("a",{className:(0,Ie.default)(r,"flex no-underline"),href:e,onClick:i,children:[(0,R.jsxs)("span",{className:"sr-only",children:["Download",n?` as ${n}`:""," ",o!=null?o:""]}),(0,R.jsx)(Xe,{width:"1.25rem",height:"1.25rem",className:Se,"aria-hidden":"true"}),(0,R.jsx)("span",{className:"w-max max-w-[200px] self-center",children:o!=null?o:t})]})}function $t({exports:e}){return!e||e.length===0?null:(0,R.jsxs)(oe,{as:"div",className:"relative flex inline-block mx-1 
grow-0",children:[(0,R.jsxs)(oe.Button,{className:"relative ml-2 -mr-1",children:[(0,R.jsx)("span",{className:"sr-only",children:"Downloads"}),(0,R.jsx)(qe,{width:"1.25rem",height:"1.25rem","aria-hidden":"true"})]}),(0,R.jsx)(oe.Items,{className:"absolute z-10 overflow-hidden bg-white rounded-sm shadow-lg -right-1 dark:bg-slate-800 ring-1 ring-black ring-opacity-5 focus:outline-none",children:e.map((t,n)=>(0,R.jsx)(oe.Item,{children:(0,R.jsx)(Wr,{className:"block p-3 no-underline hover:bg-stone-700 dark:hover:bg-stone-200 hover:text-white dark:hover:text-black",url:t.url,filename:t.filename,format:t.format,title:t.title,internal:t.internal})},n))})]})}var m=l(N(),1),ir=l(b(),1),Fe=l(le(),1);var d=l(N(),1);var f=l(b(),1);var v=l(N(),1),De="Popover",[Vt,un]=rt(De,[he]),ne=he(),[Jr,M]=Vt(De),Ut=e=>{let{__scopePopover:t,children:n,open:r,defaultOpen:o,onOpenChange:a,modal:i=!1}=e,s=ne(t),h=f.useRef(null),[g,p]=f.useState(!1),[x=!1,C]=ot({prop:r,defaultProp:o,onChange:a});return(0,v.jsx)(at,{...s,children:(0,v.jsx)(Jr,{scope:t,contentId:bt(),triggerRef:h,open:x,onOpenChange:C,onOpenToggle:f.useCallback(()=>C(P=>!P),[C]),hasCustomAnchor:g,onCustomAnchorAdd:f.useCallback(()=>p(!0),[]),onCustomAnchorRemove:f.useCallback(()=>p(!1),[]),modal:i,children:n})})};Ut.displayName=De;var Gt="PopoverAnchor",Yr=f.forwardRef((e,t)=>{let{__scopePopover:n,...r}=e,o=M(Gt,n),a=ne(n),{onCustomAnchorAdd:i,onCustomAnchorRemove:s}=o;return f.useEffect(()=>(i(),()=>s()),[i,s]),(0,v.jsx)(ge,{...a,...r,ref:t})});Yr.displayName=Gt;var Kt="PopoverTrigger",Wt=f.forwardRef((e,t)=>{let{__scopePopover:n,...r}=e,o=M(Kt,n),a=ne(n),i=pe(t,o.triggerRef),s=(0,v.jsx)(ue.button,{type:"button","aria-haspopup":"dialog","aria-expanded":o.open,"aria-controls":o.contentId,"data-state":Xt(o.open),...r,ref:i,onClick:z(e.onClick,o.onOpenToggle)});return o.hasCustomAnchor?s:(0,v.jsx)(ge,{asChild:!0,...a,children:s})});Wt.displayName=Kt;var je="PopoverPortal",[Zr,qr]=Vt(je,{forceMount:void 0}),Jt=e=>{let{__scopePopover:t,forceMount:n,children:r,container:o}=e,a=M(je,t);return(0,v.jsx)(Zr,{scope:t,forceMount:n,children:(0,v.jsx)(ve,{present:n||a.open,children:(0,v.jsx)(st,{asChild:!0,container:o,children:r})})})};Jt.displayName=je;var Z="PopoverContent",Yt=f.forwardRef((e,t)=>{let n=qr(Z,e.__scopePopover),{forceMount:r=n.forceMount,...o}=e,a=M(Z,e.__scopePopover);return(0,v.jsx)(ve,{present:r||a.open,children:a.modal?(0,v.jsx)(Qr,{...o,ref:t}):(0,v.jsx)(Xr,{...o,ref:t})})});Yt.displayName=Z;var Qr=f.forwardRef((e,t)=>{let n=M(Z,e.__scopePopover),r=f.useRef(null),o=pe(t,r),a=f.useRef(!1);return f.useEffect(()=>{let i=r.current;if(i)return _t(i)},[]),(0,v.jsx)(At,{as:nt,allowPinchZoom:!0,children:(0,v.jsx)(Zt,{...e,ref:o,trapFocus:n.open,disableOutsidePointerEvents:!0,onCloseAutoFocus:z(e.onCloseAutoFocus,i=>{i.preventDefault(),a.current||n.triggerRef.current?.focus()}),onPointerDownOutside:z(e.onPointerDownOutside,i=>{let s=i.detail.originalEvent,h=s.button===0&&s.ctrlKey===!0,g=s.button===2||h;a.current=g},{checkForDefaultPrevented:!1}),onFocusOutside:z(e.onFocusOutside,i=>i.preventDefault(),{checkForDefaultPrevented:!1})})})}),Xr=f.forwardRef((e,t)=>{let 
n=M(Z,e.__scopePopover),r=f.useRef(!1),o=f.useRef(!1);return(0,v.jsx)(Zt,{...e,ref:t,trapFocus:!1,disableOutsidePointerEvents:!1,onCloseAutoFocus:a=>{e.onCloseAutoFocus?.(a),a.defaultPrevented||(r.current||n.triggerRef.current?.focus(),a.preventDefault()),r.current=!1,o.current=!1},onInteractOutside:a=>{e.onInteractOutside?.(a),a.defaultPrevented||(r.current=!0,a.detail.originalEvent.type==="pointerdown"&&(o.current=!0));let i=a.target;n.triggerRef.current?.contains(i)&&a.preventDefault(),a.detail.originalEvent.type==="focusin"&&o.current&&a.preventDefault()}})}),Zt=f.forwardRef((e,t)=>{let{__scopePopover:n,trapFocus:r,onOpenAutoFocus:o,onCloseAutoFocus:a,disableOutsidePointerEvents:i,onEscapeKeyDown:s,onPointerDownOutside:h,onFocusOutside:g,onInteractOutside:p,...x}=e,C=M(Z,n),P=ne(n);return kt(),(0,v.jsx)(Ot,{asChild:!0,loop:!0,trapped:r,onMountAutoFocus:o,onUnmountAutoFocus:a,children:(0,v.jsx)(lt,{asChild:!0,disableOutsidePointerEvents:i,onInteractOutside:p,onEscapeKeyDown:s,onPointerDownOutside:h,onFocusOutside:g,onDismiss:()=>C.onOpenChange(!1),children:(0,v.jsx)(it,{"data-state":Xt(C.open),role:"dialog",id:C.contentId,...P,...x,ref:t,style:{...x.style,"--radix-popover-content-transform-origin":"var(--radix-popper-transform-origin)","--radix-popover-content-available-width":"var(--radix-popper-available-width)","--radix-popover-content-available-height":"var(--radix-popper-available-height)","--radix-popover-trigger-width":"var(--radix-popper-anchor-width)","--radix-popover-trigger-height":"var(--radix-popper-anchor-height)"}})})})}),qt="PopoverClose",eo=f.forwardRef((e,t)=>{let{__scopePopover:n,...r}=e,o=M(qt,n);return(0,v.jsx)(ue.button,{type:"button",...r,ref:t,onClick:z(e.onClick,()=>o.onOpenChange(!1))})});eo.displayName=qt;var to="PopoverArrow",Qt=f.forwardRef((e,t)=>{let{__scopePopover:n,...r}=e,o=ne(n);return(0,v.jsx)(ct,{...o,...r,ref:t})});Qt.displayName=to;function Xt(e){return e?"open":"closed"}var er=Ut;var tr=Wt,rr=Jt,or=Yt;var nr=Qt;var j=l(N(),1);var oo=function(e,t){var n={};for(var r in e)Object.prototype.hasOwnProperty.call(e,r)&&t.indexOf(r)<0&&(n[r]=e[r]);if(e!=null&&typeof Object.getOwnPropertySymbols=="function")for(var o=0,r=Object.getOwnPropertySymbols(e);o{var{id:s}=i,h=oo(i,["id"]);return[s,h]}))!==null&&n!==void 0?n:[])[t])!==null&&r!==void 0?r:{name:t};return(0,j.jsxs)(j.Fragment,{children:[a.name||a.institution," ",a.ror&&(0,j.jsx)("a",{className:"ml-1",href:`https://ror.org/${a.ror.replace(/(https?:\/\/)?ror\.org\//,"")}`,target:"_blank",rel:"noopener noreferrer",title:"Research Organization Registry",children:(0,j.jsx)(Ee,{width:"1rem",height:"1rem",className:"inline-block"})})]})}function q({title:e,children:t}){return(0,d.jsxs)("div",{className:"px-4 py-2 sm:grid sm:grid-cols-3 sm:gap-4 sm:px-0",children:[(0,d.jsx)("dt",{className:"text-sm font-medium leading-6 text-gray-900",children:e}),(0,d.jsx)("dd",{className:"mt-1 text-sm leading-6 text-gray-700 sm:col-span-2 sm:mt-0",children:t})]})}var ar=({author:e,affiliations:t,children:n})=>{var r;return e?(0,d.jsxs)(er,{children:[(0,d.jsx)(tr,{asChild:!0,children:(0,d.jsx)("button",{className:"focus:shadow-[0_0_0_2px] focus:shadow-black outline-none hover:underline","aria-label":"Author Details",children:n})}),(0,d.jsx)(rr,{children:(0,d.jsxs)(or,{className:"hover-card-content rounded p-5 w-[400px] bg-white shadow",sideOffset:5,children:[(0,d.jsxs)("div",{className:"flex flex-col gap-2.5",children:[(0,d.jsx)("p",{className:"text-mauve12 text-[15px] leading-[19px] font-medium 
mb-2.5",children:e.name}),(0,d.jsx)("p",{className:"text-mauve12 text-[15px] leading-[19px] font-medium mb-2.5",children:(r=e.affiliations)===null||r===void 0?void 0:r.map(o=>(0,d.jsx)(fe,{affiliations:t,affiliationId:o},o))}),(0,d.jsxs)("dl",{className:"divide-y divide-gray-100",children:[e.email&&(0,d.jsx)(q,{title:"Email",children:(0,d.jsx)("a",{className:"ml-1",href:`mailto:${e.email}`,title:`${e.name} <${e.email}>`,target:"_blank",rel:"noopener noreferrer",children:e.email})}),e.orcid&&(0,d.jsx)(q,{title:"ORCID",children:(0,d.jsx)("a",{className:"ml-1",href:`https://orcid.org/${e.orcid}`,target:"_blank",rel:"noopener noreferrer",title:"ORCID (Open Researcher and Contributor ID)",children:e.orcid})}),e.github&&(0,d.jsx)(q,{title:"GitHub",children:(0,d.jsxs)("a",{className:"ml-1",href:`https://github.com/${e.github}`,target:"_blank",rel:"noopener noreferrer",title:`GitHub: ${e.github}`,children:["@",e.github]})}),e.twitter&&(0,d.jsx)(q,{title:"Twitter",children:(0,d.jsxs)("a",{className:"ml-1",href:`https://twitter.com/${e.twitter}`,target:"_blank",rel:"noopener noreferrer",title:`Twitter: ${e.twitter}`,children:["@",e.twitter]})}),e.url&&(0,d.jsx)(q,{title:"Website",children:(0,d.jsx)("a",{className:"ml-1",href:e.url,target:"_blank",rel:"noopener noreferrer",title:"Author Website",children:e.url})}),e.roles&&(0,d.jsx)(q,{title:"Roles",children:e.roles.join(", ")})]})]}),(0,d.jsx)(nr,{className:"fill-white"})]})})]}):(0,d.jsx)(d.Fragment,{children:n})};function cr({author:e,affiliations:t,className:n}){return(0,m.jsxs)("span",{className:(0,Fe.default)("font-semibold text-sm",n),children:[(0,m.jsx)(ar,{author:e,affiliations:t,children:e.name}),e.email&&e.corresponding&&(0,m.jsx)("a",{className:"ml-1",href:`mailto:${e.email}`,title:`${e.name} <${e.email}>`,target:"_blank",rel:"noopener noreferrer",children:(0,m.jsx)(Pe,{width:"1rem",height:"1rem",className:"inline-block text-gray-400 hover:text-blue-400 -translate-y-[0.1em]"})}),e.orcid&&(0,m.jsx)("a",{className:"ml-1",href:`https://orcid.org/${e.orcid}`,target:"_blank",rel:"noopener noreferrer",title:"ORCID (Open Researcher and Contributor ID)",children:(0,m.jsx)(_e,{width:"1rem",height:"1rem",className:"inline-block text-gray-400 hover:text-[#A9C751] -translate-y-[0.1em]"})}),e.twitter&&(0,m.jsx)("a",{className:"ml-1",href:`https://twitter.com/${e.twitter}`,target:"_blank",rel:"noopener noreferrer",title:`Twitter: ${e.twitter}`,children:(0,m.jsx)(de,{width:"1rem",height:"1rem",className:"inline-block text-gray-400 hover:text-[#1DA1F2] -translate-y-[0.1em]"})})]})}function Te({authors:e,affiliations:t}){return!e||e.length===0?null:(0,m.jsx)("div",{children:e.map((n,r)=>(0,m.jsx)(cr,{author:n,affiliations:t,className:(0,Fe.default)("inline-block",{"text-comma":rr||!!o&&(o==null?void 0:o.length)>0,!1)?(0,m.jsx)("header",{className:"mt-4 not-prose",children:(0,m.jsxs)("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-y-1",children:[e.length>1&&(0,m.jsxs)(m.Fragment,{children:[(0,m.jsx)("div",{className:"pb-2 text-xs font-thin uppercase",children:"Authors"}),(0,m.jsx)("div",{className:"pb-2 text-xs font-thin uppercase",children:"Affiliations"})]}),e.map(r=>{var o;return(0,m.jsxs)(ir.default.Fragment,{children:[(0,m.jsx)("div",{children:(0,m.jsx)(cr,{author:r,affiliations:t})}),(0,m.jsx)("div",{className:"text-sm",children:(o=r.affiliations)===null||o===void 0?void 0:o.map(a=>(0,m.jsx)("div",{children:(0,m.jsx)(fe,{affiliations:t,affiliationId:a})},a))})]},r.name)})]})}):(0,m.jsx)("header",{className:"mt-4 
not-prose",children:(0,m.jsx)(Te,{authors:e,affiliations:t})})}function no({to:e,className:t,title:n,children:r}){return(0,c.jsx)("a",{href:e,className:t,title:n,children:r})}function ao({doi:e,className:t}){if(!e)return null;let r=`https://doi.org/${e.replace(/^(https?:\/\/)?(dx\.)?doi\.org\//,"")}`;return(0,c.jsx)("div",{className:(0,Q.default)("flex-none",t),title:"DOI (Digital Object Identifier)",children:(0,c.jsx)("a",{className:"font-light no-underline hover:font-light hover:underline text-inherit hover:text-inherit",target:"_blank",rel:"noopener noreferrer",href:r,children:r})})}function io({date:e,format:t={year:"numeric",month:"long",day:"numeric"},spacer:n}){if(!e)return null;let r=new Date(e),a=new Date(r.getUTCFullYear(),r.getUTCMonth(),r.getUTCDate()).toLocaleDateString("en-US",t);return(0,c.jsx)("time",{dateTime:e,className:(0,Q.default)({"text-spacer":n}),children:a})}function co({github:e}){if(!e)return null;let t=e.replace(/^(https?:\/\/)?github\.com\//,"");return(0,c.jsx)("a",{href:`https://github.com/${t}`,title:`GitHub Repository: ${t}`,target:"_blank",rel:"noopener noreferrer",className:"text-inherit hover:text-inherit",children:(0,c.jsx)(Ne,{width:"1.25rem",height:"1.25rem",className:"inline-block mr-1 opacity-60 hover:opacity-100"})})}function so({open_access:e}){return e?(0,c.jsx)("a",{href:"https://en.wikipedia.org/wiki/Open_access",target:"_blank",rel:"noopener noreferrer",title:"Open Access",className:"text-inherit hover:text-inherit",children:(0,c.jsx)(Oe,{width:"1.25rem",height:"1.25rem",className:"mr-1 inline-block opacity-60 hover:opacity-100 hover:text-[#E18435]"})}):null}function lo({venue:e,volume:t,issue:n,className:r}){if(!e)return null;let{title:o,url:a}=typeof e=="string"?{title:e,url:null}:e;return o?(0,c.jsxs)("div",{className:(0,Q.default)("flex-none mr-2",r),children:[a?(0,c.jsx)(no,{className:"font-semibold no-underline smallcaps",to:a,title:o,children:o}):(0,c.jsx)("span",{className:"font-semibold smallcaps",children:o}),t!=null&&(0,c.jsxs)("span",{className:"pl-2 ml-2 border-l",children:["Volume ",t.title,n!=null&&(0,c.jsxs)(c.Fragment,{children:[", Issue ",n.title]})]})]}):null}function lr({frontmatter:e,kind:t=T.Article,authorStyle:n="block",hideBadges:r,hideExports:o,className:a}){if(!e)return null;let{title:i,subtitle:s,subject:h,doi:g,open_access:p,license:x,github:C,venue:P,volume:S,issue:_,exports:A,downloads:k,date:X,authors:ee}=e,te=t===T.Notebook,re=k?k.length>0:A&&A.length>0,F=ee&&ee.length>0,ae=!!p||!!x||!!re||!!te||!!C,ie=!!h||!!P||!!S||!!_,ze=!!g||!!X,Be=ie||ae&&!r||re&&!o;return!i&&!s&&!Be&&!F&&!ze?null:(0,c.jsxs)("div",{id:"skip-to-frontmatter","aria-label":"article frontmatter",className:(0,Q.default)(a),children:[Be&&(0,c.jsxs)("div",{className:"flex items-center h-6 mb-5 text-sm font-light",children:[h&&(0,c.jsx)("div",{className:(0,Q.default)("flex-none pr-2 smallcaps",{"border-r mr-2":P}),children:h}),(0,c.jsx)(lo,{venue:P,volume:S,issue:_}),(0,c.jsx)("div",{className:"flex-grow"}),!r&&(0,c.jsxs)(c.Fragment,{children:[(0,c.jsx)(Bt,{license:x}),(0,c.jsx)(so,{open_access:p}),(0,c.jsx)(co,{github:C}),te&&(0,c.jsx)("div",{className:"inline-block mr-1",children:(0,c.jsx)(ke,{width:"1.25rem",height:"1.25rem",className:"inline-block",title:"Jupyter Notebook"})})]}),!o&&(0,c.jsx)($t,{exports:k!=null?k:A})]}),i&&(0,c.jsx)("h1",{className:"mb-0",children:i}),s&&(0,c.jsx)("p",{className:"mt-2 mb-0 lead text-zinc-600 
dark:text-zinc-400",children:s}),F&&n==="list"&&(0,c.jsx)(Te,{authors:e.authors,affiliations:e.affiliations}),F&&n==="block"&&(0,c.jsx)(sr,{authors:e.authors,affiliations:e.affiliations}),ze&&(0,c.jsxs)("div",{className:"flex mt-2 text-sm font-light",children:[(0,c.jsx)(io,{date:X,spacer:!!g}),(0,c.jsx)(ao,{doi:g})]})]})}var pr=l(hr());var O=l(N(),1);function mo({size:e=24,fill:t="#616161",highlight:n="#F37726",className:r}){return(0,O.jsx)("svg",{style:{width:e,height:e},xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 100 100",stroke:"none",className:r,children:(0,O.jsxs)("g",{id:"icon",children:[(0,O.jsx)("path",{fill:t,d:`M23.8,54.8v-3.6l4.7-0.8V17.5l-4.7-0.8V13H36l13.4,31.7h0.2l13-31.7h12.6v3.6l-4.7,0.8v32.9l4.7,0.8v3.6h-15 + v-3.6l4.9-0.8V20.8H65L51.4,53.3h-3.8l-14-32.5h-0.1l0.2,17.4v12.1l5,0.8v3.6H23.8z`}),(0,O.jsx)("path",{fill:n,d:`M47,86.9c0-5.9-3.4-8.8-10.1-8.8h-8.4c-5.2,0-9.4-1.3-12.5-3.8c-3.1-2.5-5.4-6.2-6.8-11l4.8-1.6 + c1.8,5.6,6.4,8.6,13.8,8.8h9.2c6.4,0,10.8,2.5,13.1,7.5c2.3-5,6.7-7.5,13.1-7.5h8.4c7.8,0,12.7-2.9,14.6-8.7l4.8,1.6 + c-1.4,4.9-3.6,8.6-6.8,11.1c-3.1,2.5-7.3,3.7-12.4,3.8H63c-6.7,0-10,2.9-10,8.8`})]})})}function dr({url:e="https://mystmd.org/made-with-myst"}){return(0,O.jsxs)("a",{className:"flex mx-auto text-gray-700 w-fit hover:text-blue-700 dark:text-gray-200 dark:hover:text-blue-400",href:e,target:"_blank",rel:"noreferrer",children:[(0,O.jsx)(mo,{fill:"currentColor"}),(0,O.jsx)("span",{className:"self-center ml-2 text-sm",children:"Made with MyST"})]})}var mr=l(b());var y=l(N());function fo(e,t){var n;return t.downloads?t.downloads:e?[...(n=t.exports)!=null?n:[],...e]:t.exports}var fr=mr.default.memo(function({article:e,hide_all_footer_links:t,hideKeywords:n}){var k,X,ee,te,re,F,ae,ie;let r=Ze(),o=pt(),a=ce(),i=(X=(k=e.frontmatter)==null?void 0:k.site)!=null?X:{},s=(te=(ee=se())==null?void 0:ee.options)!=null?te:{},{hide_title_block:h,hide_footer_links:g,hide_outline:p,outline_maxdepth:x}={...s,...i},C=fo(r==null?void 0:r.downloads,e.frontmatter),P=Ve(e.mdast),S=(F=(re=e.frontmatter)==null?void 0:re.keywords)!=null?F:[],_=xt(P,(ae=e.frontmatter)==null?void 0:ae.parts),A=Ue("(min-width: 1024px)");return(0,y.jsx)(Ge,{references:{...e.references,article:e.mdast},frontmatter:e.frontmatter,children:(0,y.jsx)(mt,{children:(0,y.jsxs)(dt,{enable:(ie=o==null?void 0:o.enabled)!=null?ie:!1,contents:e,children:[!h&&(0,y.jsx)(lr,{kind:e.kind,frontmatter:{...e.frontmatter,downloads:C},className:"mb-8 pt-9"}),!p&&(0,y.jsx)("div",{className:"block my-10 lg:sticky lg:z-10 lg:h-0 lg:pt-0 lg:my-0 lg:ml-10 lg:col-margin-right",style:{top:a},children:(0,y.jsx)(yt,{className:"relative mt-9",maxdepth:x,isMargin:A})}),(o==null?void 0:o.enabled)&&o.features.notebookCompute&&e.kind===T.Notebook&&(0,y.jsx)(vt,{showLaunch:!0}),(o==null?void 0:o.enabled)&&e.kind===T.Article&&(0,y.jsx)(ht,{pageSlug:e.slug}),(0,y.jsx)("div",{id:"skip-to-article"}),(0,y.jsx)(It,{parts:_,keywords:S,hideKeywords:n}),(0,y.jsx)(wt,{pageKind:e.kind,mdast:P}),(0,y.jsx)(Dt,{parts:_}),(0,y.jsx)(Nt,{}),(0,y.jsx)(Pt,{}),(0,y.jsx)(gt,{}),!g&&!t&&(0,y.jsx)(Ct,{links:e.footer})]})})})});var u=l(N()),Zn=({data:e,matches:t,location:n})=>{var s,h,g,p,x,C,P,S,_,A,k;if(!e)return[];let r=e.config,o=e.project,a=e.page.frontmatter,i=(h=(s=r==null?void 0:r.title)!=null?s:o==null?void 0:o.title)!=null?h:"";return Tt({origin:"",url:n.pathname,title:a!=null&&a.title?`${a.title}${i?` - ${i}`:""}`:i,description:(x=(p=(g=a==null?void 0:a.description)!=null?g:o==null?void 0:o.description)!=null?p:r==null?void 
0:r.description)!=null?x:void 0,image:(P=(C=(a==null?void 0:a.thumbnailOptimized)||(a==null?void 0:a.thumbnail))!=null?C:(o==null?void 0:o.thumbnailOptimized)||(o==null?void 0:o.thumbnail))!=null?P:void 0,twitter:(S=r==null?void 0:r.options)==null?void 0:S.twitter,keywords:(k=(A=(_=a==null?void 0:a.keywords)!=null?_:o==null?void 0:o.keywords)!=null?A:r==null?void 0:r.keywords)!=null?k:[]})},qn=()=>[zt];function po({children:e,hide_toc:t,hideSearch:n,projectSlug:r,inset:o=20}){let a=ce(),{container:i,toc:s}=Mt(a,o);return(0,u.jsxs)(u.Fragment,{children:[(0,u.jsx)(Et,{hideToc:t,hideSearch:n}),(0,u.jsx)(St,{sidebarRef:s,hide_toc:t,footer:(0,u.jsx)(dr,{}),projectSlug:r}),(0,u.jsx)(Je,{children:(0,u.jsx)("article",{ref:i,className:"article content article-grid grid-gap",children:e})})]})}function ur({children:e,hide_toc:t,hideSearch:n,projectSlug:r,inset:o=20}){return(0,u.jsx)(We,{children:(0,u.jsx)(po,{children:e,hide_toc:t,hideSearch:n,projectSlug:r,inset:o})})}function uo(){var h,g,p,x;let{container:e}=Rt(),t=$e(),n=Ke(),r=(g=(h=t.page.frontmatter)==null?void 0:h.site)!=null?g:{},o=(x=(p=se())==null?void 0:p.options)!=null?x:{},{hide_toc:a,hide_search:i,hide_footer_links:s}={...o,...r};return(0,u.jsx)(ur,{hide_toc:a,hideSearch:i,projectSlug:t.page.project,children:(0,u.jsx)(Ye,{children:(0,u.jsx)(ft,{features:{notebookCompute:!0,figureCompute:!0,launchBinder:!1},children:(0,u.jsx)(ut,{baseurl:n,children:(0,u.jsx)("main",{ref:e,className:"article-grid subgrid-gap col-screen",children:(0,u.jsx)(fr,{article:t.page,hide_all_footer_links:s})})})})})})}function Qn(){let e=He();return(0,u.jsx)(ur,{children:(0,u.jsx)("main",{className:"article",children:Le(e)?(0,u.jsx)(jt,{}):(0,u.jsx)(Ft,{error:e})})})}export{zt as a,Zn as b,qn as c,uo as d,Qn as e}; diff --git a/build/_shared/chunk-DCZNW6LG.js b/build/_shared/chunk-DCZNW6LG.js deleted file mode 100644 index cd60a37..0000000 --- a/build/_shared/chunk-DCZNW6LG.js +++ /dev/null @@ -1,206 +0,0 @@ -import{b as dc,g as o1,h as e_,k as Jq}from"/build/_shared/chunk-YAIQ7LUU.js";import{e as hk,f as fk,g as Vi}from"/build/_shared/chunk-HTHE5KDW.js";import{a as se,b as Z,c as Vt,d as Pn,e as Yn,f as $b,g as ke,h as Bs,i as Ub,j as uc,k as uk,l as dk,m as $r,n as qb,o as Et,p as Wt,q as Vb,r as pk}from"/build/_shared/chunk-JCLNTD6A.js";import{a as Ws,b as Ck,c as Yp,d as s1,e as Xq,f as ra,g as Zb,h as Cn,i as Qb,j as $s,k as Sk,l as Su}from"/build/_shared/chunk-HYMQ7M2K.js";import{a as Mt}from"/build/_shared/chunk-3CVK3PYF.js";import{b as Kp}from"/build/_shared/chunk-J6FHCSRC.js";import{B as vk,b as ak,q as lk,r as ck,s as cc,t as mk,u as wu,w as Lh,z as gk}from"/build/_shared/chunk-NF5NQVJX.js";import{$ as Lk,A as Vp,B as $q,I as Gp,J as xk,K as i1,O as yk,Q as n1,R as Uq,S as qq,T as Vq,U as Gq,V as Yq,W as Kq,X as wk,Y as r1,Z as t_,_ as Ek,a as Fs,aa as Zq,b as Ih,ba as Ik,c as vt,ca as Qq,d as bk,da as Us,e as Cu,ea as a1,f as Gi,fa as Mk,g as cr,ga as e6,h as Te,ha as Ak,i as Hs,ia as t6,j as _k,k as Gb,l as bt,m as qp,n as t1,o as mn,p as Yb,q as js,r as Kb,s as ur,t as Xb,u as Le,v as ri,w as Jb,z as Eo}from"/build/_shared/chunk-5CFTM6YW.js";import{a as JR,b as ZR,c as QR,d as lc,e as ek,g as QC,h as e1,i as tk,j as ik,k as nk,m as rk,n as sk,p as ok}from"/build/_shared/chunk-OCTKKCIL.js";import{a as le,b as Wb,c as Wq,d as me}from"/build/_shared/chunk-UAI5KRM7.js";import{b as $,c as Ye,d as Eh,e as P,f as Ka}from"/build/_shared/chunk-2NH4LW52.js";var PO=Ye((khe,sl)=>{function lS(t){return sl.exports=lS=typeof Symbol=="function"&&typeof 
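The dropdown's download path is short enough to reconstruct from the minified source. A de-minified sketch, with guessed names, of the pair that fetches an export and hands the Blob to a synthetic anchor click:

```js
// Sketch of the bundle's download helper; names are reconstructed guesses.
async function downloadUrl(url, filename) {
  const blob = await (await fetch(url)).blob();
  return saveBlob(blob, filename);
}

function saveBlob(blob, filename) {
  // Legacy Edge/IE path kept by the original code.
  if (window.navigator && window.navigator.msSaveOrOpenBlob) {
    return window.navigator.msSaveOrOpenBlob(blob);
  }
  const objectUrl = URL.createObjectURL(blob);
  const anchor = document.createElement('a');
  anchor.href = objectUrl;
  anchor.download = filename;
  anchor.style.display = 'none';
  // A synthetic click starts the browser's download flow.
  anchor.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true, view: window }));
  // Release the object URL once the download has been handed off.
  setTimeout(() => {
    URL.revokeObjectURL(objectUrl);
    anchor.remove();
  }, 100);
  return true;
}
```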
Symbol.iterator=="symbol"?function(e){return typeof e}:function(e){return e&&typeof Symbol=="function"&&e.constructor===Symbol&&e!==Symbol.prototype?"symbol":typeof e},sl.exports.__esModule=!0,sl.exports.default=sl.exports,lS(t)}sl.exports=lS,sl.exports.__esModule=!0,sl.exports.default=sl.exports});var FO=Ye((Nhe,ol)=>{var zO=PO().default;function BO(){"use strict";ol.exports=BO=function(){return e},ol.exports.__esModule=!0,ol.exports.default=ol.exports;var t,e={},i=Object.prototype,n=i.hasOwnProperty,r=Object.defineProperty||function(K,V,ne){K[V]=ne.value},s=typeof Symbol=="function"?Symbol:{},o=s.iterator||"@@iterator",a=s.asyncIterator||"@@asyncIterator",l=s.toStringTag||"@@toStringTag";function c(K,V,ne){return Object.defineProperty(K,V,{value:ne,enumerable:!0,configurable:!0,writable:!0}),K[V]}try{c({},"")}catch{c=function(ne,_e,Pe){return ne[_e]=Pe}}function u(K,V,ne,_e){var Pe=V&&V.prototype instanceof y?V:y,Ce=Object.create(Pe.prototype),Ae=new Q(_e||[]);return r(Ce,"_invoke",{value:w(K,ne,Ae)}),Ce}function d(K,V,ne){try{return{type:"normal",arg:K.call(V,ne)}}catch(_e){return{type:"throw",arg:_e}}}e.wrap=u;var f="suspendedStart",h="suspendedYield",m="executing",p="completed",v={};function y(){}function C(){}function M(){}var O={};c(O,o,function(){return this});var R=Object.getPrototypeOf,_=R&&R(R(X([])));_&&_!==i&&n.call(_,o)&&(O=_);var L=M.prototype=y.prototype=Object.create(O);function S(K){["next","throw","return"].forEach(function(V){c(K,V,function(ne){return this._invoke(V,ne)})})}function x(K,V){function ne(Pe,Ce,Ae,ut){var Xe=d(K[Pe],K,Ce);if(Xe.type!=="throw"){var tt=Xe.arg,ht=tt.value;return ht&&zO(ht)=="object"&&n.call(ht,"__await")?V.resolve(ht.__await).then(function(St){ne("next",St,Ae,ut)},function(St){ne("throw",St,Ae,ut)}):V.resolve(ht).then(function(St){tt.value=St,Ae(tt)},function(St){return ne("throw",St,Ae,ut)})}ut(Xe.arg)}var _e;r(this,"_invoke",{value:function(Ce,Ae){function ut(){return new V(function(Xe,tt){ne(Ce,Ae,Xe,tt)})}return _e=_e?_e.then(ut,ut):ut()}})}function w(K,V,ne){var _e=f;return function(Pe,Ce){if(_e===m)throw Error("Generator is already running");if(_e===p){if(Pe==="throw")throw Ce;return{value:t,done:!0}}for(ne.method=Pe,ne.arg=Ce;;){var Ae=ne.delegate;if(Ae){var ut=E(Ae,ne);if(ut){if(ut===v)continue;return ut}}if(ne.method==="next")ne.sent=ne._sent=ne.arg;else if(ne.method==="throw"){if(_e===f)throw _e=p,ne.arg;ne.dispatchException(ne.arg)}else ne.method==="return"&&ne.abrupt("return",ne.arg);_e=m;var Xe=d(K,V,ne);if(Xe.type==="normal"){if(_e=ne.done?p:h,Xe.arg===v)continue;return{value:Xe.arg,done:ne.done}}Xe.type==="throw"&&(_e=p,ne.method="throw",ne.arg=Xe.arg)}}}function E(K,V){var ne=V.method,_e=K.iterator[ne];if(_e===t)return V.delegate=null,ne==="throw"&&K.iterator.return&&(V.method="return",V.arg=t,E(K,V),V.method==="throw")||ne!=="return"&&(V.method="throw",V.arg=new TypeError("The iterator does not provide a '"+ne+"' method")),v;var Pe=d(_e,K.iterator,V.arg);if(Pe.type==="throw")return V.method="throw",V.arg=Pe.arg,V.delegate=null,v;var Ce=Pe.arg;return Ce?Ce.done?(V[K.resultName]=Ce.value,V.next=K.nextLoc,V.method!=="return"&&(V.method="next",V.arg=t),V.delegate=null,v):Ce:(V.method="throw",V.arg=new TypeError("iterator result is not an object"),V.delegate=null,v)}function N(K){var V={tryLoc:K[0]};1 in K&&(V.catchLoc=K[1]),2 in K&&(V.finallyLoc=K[2],V.afterLoc=K[3]),this.tryEntries.push(V)}function B(K){var V=K.completion||{};V.type="normal",delete V.arg,K.completion=V}function 
Q(K){this.tryEntries=[{tryLoc:"root"}],K.forEach(N,this),this.reset(!0)}function X(K){if(K||K===""){var V=K[o];if(V)return V.call(K);if(typeof K.next=="function")return K;if(!isNaN(K.length)){var ne=-1,_e=function Pe(){for(;++ne=0;--Pe){var Ce=this.tryEntries[Pe],Ae=Ce.completion;if(Ce.tryLoc==="root")return _e("end");if(Ce.tryLoc<=this.prev){var ut=n.call(Ce,"catchLoc"),Xe=n.call(Ce,"finallyLoc");if(ut&&Xe){if(this.prev=0;--_e){var Pe=this.tryEntries[_e];if(Pe.tryLoc<=this.prev&&n.call(Pe,"finallyLoc")&&this.prev=0;--ne){var _e=this.tryEntries[ne];if(_e.finallyLoc===V)return this.complete(_e.completion,_e.afterLoc),B(_e),v}},catch:function(V){for(var ne=this.tryEntries.length-1;ne>=0;--ne){var _e=this.tryEntries[ne];if(_e.tryLoc===V){var Pe=_e.completion;if(Pe.type==="throw"){var Ce=Pe.arg;B(_e)}return Ce}}throw Error("illegal catch attempt")},delegateYield:function(V,ne,_e){return this.delegate={iterator:X(V),resultName:ne,nextLoc:_e},this.method==="next"&&(this.arg=t),v}},e}ol.exports=BO,ol.exports.__esModule=!0,ol.exports.default=ol.exports});var cS=Ye((Dhe,HO)=>{var j_=FO()();HO.exports=j_;try{regeneratorRuntime=j_}catch{typeof globalThis=="object"?globalThis.regeneratorRuntime=j_:Function("r","regeneratorRuntime = r")(j_)}});var QO=Ye(ZO=>{"use strict";var Ef=le();function $V(t,e){return t===e&&(t!==0||1/t===1/e)||t!==t&&e!==e}var UV=typeof Object.is=="function"?Object.is:$V,qV=Ef.useState,VV=Ef.useEffect,GV=Ef.useLayoutEffect,YV=Ef.useDebugValue;function KV(t,e){var i=e(),n=qV({inst:{value:i,getSnapshot:e}}),r=n[0].inst,s=n[1];return GV(function(){r.value=i,r.getSnapshot=e,pS(r)&&s({inst:r})},[t,i,e]),VV(function(){return pS(r)&&s({inst:r}),t(function(){pS(r)&&s({inst:r})})},[t]),YV(i),i}function pS(t){var e=t.getSnapshot;t=t.value;try{var i=e();return!UV(t,i)}catch{return!0}}function XV(t,e){return e()}var JV=typeof window>"u"||typeof window.document>"u"||typeof window.document.createElement>"u"?XV:KV;ZO.useSyncExternalStore=Ef.useSyncExternalStore!==void 0?Ef.useSyncExternalStore:JV});var gS=Ye((mfe,eP)=>{"use strict";eP.exports=QO()});var 
DS=Ye((Yme,fY)=>{fY.exports=["0BSD","3D-Slicer-1.0","AAL","ADSL","AFL-1.1","AFL-1.2","AFL-2.0","AFL-2.1","AFL-3.0","AGPL-1.0-only","AGPL-1.0-or-later","AGPL-3.0-only","AGPL-3.0-or-later","AMD-newlib","AMDPLPA","AML","AML-glslang","AMPAS","ANTLR-PD","ANTLR-PD-fallback","APAFML","APL-1.0","APSL-1.0","APSL-1.1","APSL-1.2","APSL-2.0","ASWF-Digital-Assets-1.0","ASWF-Digital-Assets-1.1","Abstyles","AdaCore-doc","Adobe-2006","Adobe-Display-PostScript","Adobe-Glyph","Adobe-Utopia","Afmparse","Aladdin","Apache-1.0","Apache-1.1","Apache-2.0","App-s2p","Arphic-1999","Artistic-1.0","Artistic-1.0-Perl","Artistic-1.0-cl8","Artistic-2.0","BSD-1-Clause","BSD-2-Clause","BSD-2-Clause-Darwin","BSD-2-Clause-Patent","BSD-2-Clause-Views","BSD-2-Clause-first-lines","BSD-3-Clause","BSD-3-Clause-Attribution","BSD-3-Clause-Clear","BSD-3-Clause-HP","BSD-3-Clause-LBNL","BSD-3-Clause-Modification","BSD-3-Clause-No-Military-License","BSD-3-Clause-No-Nuclear-License","BSD-3-Clause-No-Nuclear-License-2014","BSD-3-Clause-No-Nuclear-Warranty","BSD-3-Clause-Open-MPI","BSD-3-Clause-Sun","BSD-3-Clause-acpica","BSD-3-Clause-flex","BSD-4-Clause","BSD-4-Clause-Shortened","BSD-4-Clause-UC","BSD-4.3RENO","BSD-4.3TAHOE","BSD-Advertising-Acknowledgement","BSD-Attribution-HPND-disclaimer","BSD-Inferno-Nettverk","BSD-Protection","BSD-Source-Code","BSD-Source-beginning-file","BSD-Systemics","BSD-Systemics-W3Works","BSL-1.0","BUSL-1.1","Baekmuk","Bahyph","Barr","Beerware","BitTorrent-1.0","BitTorrent-1.1","Bitstream-Charter","Bitstream-Vera","BlueOak-1.0.0","Boehm-GC","Borceux","Brian-Gladman-2-Clause","Brian-Gladman-3-Clause","C-UDA-1.0","CAL-1.0","CAL-1.0-Combined-Work-Exception","CATOSL-1.1","CC-BY-1.0","CC-BY-2.0","CC-BY-2.5","CC-BY-2.5-AU","CC-BY-3.0","CC-BY-3.0-AT","CC-BY-3.0-AU","CC-BY-3.0-DE","CC-BY-3.0-IGO","CC-BY-3.0-NL","CC-BY-3.0-US","CC-BY-4.0","CC-BY-NC-1.0","CC-BY-NC-2.0","CC-BY-NC-2.5","CC-BY-NC-3.0","CC-BY-NC-3.0-DE","CC-BY-NC-4.0","CC-BY-NC-ND-1.0","CC-BY-NC-ND-2.0","CC-BY-NC-ND-2.5","CC-BY-NC-ND-3.0","CC-BY-NC-ND-3.0-DE","CC-BY-NC-ND-3.0-IGO","CC-BY-NC-ND-4.0","CC-BY-NC-SA-1.0","CC-BY-NC-SA-2.0","CC-BY-NC-SA-2.0-DE","CC-BY-NC-SA-2.0-FR","CC-BY-NC-SA-2.0-UK","CC-BY-NC-SA-2.5","CC-BY-NC-SA-3.0","CC-BY-NC-SA-3.0-DE","CC-BY-NC-SA-3.0-IGO","CC-BY-NC-SA-4.0","CC-BY-ND-1.0","CC-BY-ND-2.0","CC-BY-ND-2.5","CC-BY-ND-3.0","CC-BY-ND-3.0-DE","CC-BY-ND-4.0","CC-BY-SA-1.0","CC-BY-SA-2.0","CC-BY-SA-2.0-UK","CC-BY-SA-2.1-JP","CC-BY-SA-2.5","CC-BY-SA-3.0","CC-BY-SA-3.0-AT","CC-BY-SA-3.0-DE","CC-BY-SA-3.0-IGO","CC-BY-SA-4.0","CC-PDDC","CC0-1.0","CDDL-1.0","CDDL-1.1","CDL-1.0","CDLA-Permissive-1.0","CDLA-Permissive-2.0","CDLA-Sharing-1.0","CECILL-1.0","CECILL-1.1","CECILL-2.0","CECILL-2.1","CECILL-B","CECILL-C","CERN-OHL-1.1","CERN-OHL-1.2","CERN-OHL-P-2.0","CERN-OHL-S-2.0","CERN-OHL-W-2.0","CFITSIO","CMU-Mach","CMU-Mach-nodoc","CNRI-Jython","CNRI-Python","CNRI-Python-GPL-Compatible","COIL-1.0","CPAL-1.0","CPL-1.0","CPOL-1.02","CUA-OPL-1.0","Caldera","Caldera-no-preamble","Catharon","ClArtistic","Clips","Community-Spec-1.0","Condor-1.1","Cornell-Lossless-JPEG","Cronyx","Crossword","CrystalStacker","Cube","D-FSL-1.0","DEC-3-Clause","DL-DE-BY-2.0","DL-DE-ZERO-2.0","DOC","DRL-1.0","DRL-1.1","DSDP","DocBook-Schema","DocBook-XML","Dotseqn","ECL-1.0","ECL-2.0","EFL-1.0","EFL-2.0","EPICS","EPL-1.0","EPL-2.0","EUDatagrid","EUPL-1.0","EUPL-1.1","EUPL-1.2","Elastic-2.0","Entessa","ErlPL-1.1","Eurosym","FBM","FDK-AAC","FSFAP","FSFAP-no-warranty-disclaimer","FSFUL","FSFULLR","FSFULLRWD","FTL","Fair","Ferguson-Twofish","Frameworx-1.0","FreeBSD-DOC","F
reeImage","Furuseth","GCR-docs","GD","GFDL-1.1-invariants-only","GFDL-1.1-invariants-or-later","GFDL-1.1-no-invariants-only","GFDL-1.1-no-invariants-or-later","GFDL-1.1-only","GFDL-1.1-or-later","GFDL-1.2-invariants-only","GFDL-1.2-invariants-or-later","GFDL-1.2-no-invariants-only","GFDL-1.2-no-invariants-or-later","GFDL-1.2-only","GFDL-1.2-or-later","GFDL-1.3-invariants-only","GFDL-1.3-invariants-or-later","GFDL-1.3-no-invariants-only","GFDL-1.3-no-invariants-or-later","GFDL-1.3-only","GFDL-1.3-or-later","GL2PS","GLWTPL","GPL-1.0-only","GPL-1.0-or-later","GPL-2.0-only","GPL-2.0-or-later","GPL-3.0-only","GPL-3.0-or-later","Giftware","Glide","Glulxe","Graphics-Gems","Gutmann","HIDAPI","HP-1986","HP-1989","HPND","HPND-DEC","HPND-Fenneberg-Livingston","HPND-INRIA-IMAG","HPND-Intel","HPND-Kevlin-Henney","HPND-MIT-disclaimer","HPND-Markus-Kuhn","HPND-Netrek","HPND-Pbmplus","HPND-UC","HPND-UC-export-US","HPND-doc","HPND-doc-sell","HPND-export-US","HPND-export-US-acknowledgement","HPND-export-US-modify","HPND-export2-US","HPND-merchantability-variant","HPND-sell-MIT-disclaimer-xserver","HPND-sell-regexpr","HPND-sell-variant","HPND-sell-variant-MIT-disclaimer","HPND-sell-variant-MIT-disclaimer-rev","HTMLTIDY","HaskellReport","Hippocratic-2.1","IBM-pibs","ICU","IEC-Code-Components-EULA","IJG","IJG-short","IPA","IPL-1.0","ISC","ISC-Veillard","ImageMagick","Imlib2","Info-ZIP","Inner-Net-2.0","Intel","Intel-ACPI","Interbase-1.0","JPL-image","JPNIC","JSON","Jam","JasPer-2.0","Kastrup","Kazlib","Knuth-CTAN","LAL-1.2","LAL-1.3","LGPL-2.0-only","LGPL-2.0-or-later","LGPL-2.1-only","LGPL-2.1-or-later","LGPL-3.0-only","LGPL-3.0-or-later","LGPLLR","LOOP","LPD-document","LPL-1.0","LPL-1.02","LPPL-1.0","LPPL-1.1","LPPL-1.2","LPPL-1.3a","LPPL-1.3c","LZMA-SDK-9.11-to-9.20","LZMA-SDK-9.22","Latex2e","Latex2e-translated-notice","Leptonica","LiLiQ-P-1.1","LiLiQ-R-1.1","LiLiQ-Rplus-1.1","Libpng","Linux-OpenIB","Linux-man-pages-1-para","Linux-man-pages-copyleft","Linux-man-pages-copyleft-2-para","Linux-man-pages-copyleft-var","Lucida-Bitmap-Fonts","MIT","MIT-0","MIT-CMU","MIT-Festival","MIT-Khronos-old","MIT-Modern-Variant","MIT-Wu","MIT-advertising","MIT-enna","MIT-feh","MIT-open-group","MIT-testregex","MITNFA","MMIXware","MPEG-SSG","MPL-1.0","MPL-1.1","MPL-2.0","MPL-2.0-no-copyleft-exception","MS-LPL","MS-PL","MS-RL","MTLL","Mackerras-3-Clause","Mackerras-3-Clause-acknowledgment","MakeIndex","Martin-Birgmeier","McPhee-slideshow","Minpack","MirOS","Motosoto","MulanPSL-1.0","MulanPSL-2.0","Multics","Mup","NAIST-2003","NASA-1.3","NBPL-1.0","NCBI-PD","NCGL-UK-2.0","NCL","NCSA","NGPL","NICTA-1.0","NIST-PD","NIST-PD-fallback","NIST-Software","NLOD-1.0","NLOD-2.0","NLPL","NOSL","NPL-1.0","NPL-1.1","NPOSL-3.0","NRL","NTP","NTP-0","Naumen","NetCDF","Newsletr","Nokia","Noweb","O-UDA-1.0","OAR","OCCT-PL","OCLC-2.0","ODC-By-1.0","ODbL-1.0","OFFIS","OFL-1.0","OFL-1.0-RFN","OFL-1.0-no-RFN","OFL-1.1","OFL-1.1-RFN","OFL-1.1-no-RFN","OGC-1.0","OGDL-Taiwan-1.0","OGL-Canada-2.0","OGL-UK-1.0","OGL-UK-2.0","OGL-UK-3.0","OGTSL","OLDAP-1.1","OLDAP-1.2","OLDAP-1.3","OLDAP-1.4","OLDAP-2.0","OLDAP-2.0.1","OLDAP-2.1","OLDAP-2.2","OLDAP-2.2.1","OLDAP-2.2.2","OLDAP-2.3","OLDAP-2.4","OLDAP-2.5","OLDAP-2.6","OLDAP-2.7","OLDAP-2.8","OLFL-1.3","OML","OPL-1.0","OPL-UK-3.0","OPUBL-1.0","OSET-PL-2.1","OSL-1.0","OSL-1.1","OSL-2.0","OSL-2.1","OSL-3.0","OpenPBS-2.3","OpenSSL","OpenSSL-standalone","OpenVision","PADL","PDDL-1.0","PHP-3.0","PHP-3.01","PPL","PSF-2.0","Parity-6.0.0","Parity-7.0.0","Pixar","Plexus","PolyForm-Noncommercial-1.0.0","PolyForm-Small
-Business-1.0.0","PostgreSQL","Python-2.0","Python-2.0.1","QPL-1.0","QPL-1.0-INRIA-2004","Qhull","RHeCos-1.1","RPL-1.1","RPL-1.5","RPSL-1.0","RSA-MD","RSCPL","Rdisc","Ruby","Ruby-pty","SAX-PD","SAX-PD-2.0","SCEA","SGI-B-1.0","SGI-B-1.1","SGI-B-2.0","SGI-OpenGL","SGP4","SHL-0.5","SHL-0.51","SISSL","SISSL-1.2","SL","SMLNJ","SMPPL","SNIA","SPL-1.0","SSH-OpenSSH","SSH-short","SSLeay-standalone","SSPL-1.0","SWL","Saxpath","SchemeReport","Sendmail","Sendmail-8.23","SimPL-2.0","Sleepycat","Soundex","Spencer-86","Spencer-94","Spencer-99","SugarCRM-1.1.3","Sun-PPP","Sun-PPP-2000","SunPro","Symlinks","TAPR-OHL-1.0","TCL","TCP-wrappers","TGPPL-1.0","TMate","TORQUE-1.1","TOSL","TPDL","TPL-1.0","TTWL","TTYP0","TU-Berlin-1.0","TU-Berlin-2.0","TermReadKey","UCAR","UCL-1.0","UMich-Merit","UPL-1.0","URT-RLE","Ubuntu-font-1.0","Unicode-3.0","Unicode-DFS-2015","Unicode-DFS-2016","Unicode-TOU","UnixCrypt","Unlicense","VOSTROM","VSL-1.0","Vim","W3C","W3C-19980720","W3C-20150513","WTFPL","Watcom-1.0","Widget-Workshop","Wsuipa","X11","X11-distribute-modifications-variant","X11-swapped","XFree86-1.1","XSkat","Xdebug-1.03","Xerox","Xfig","Xnet","YPL-1.0","YPL-1.1","ZPL-1.1","ZPL-2.0","ZPL-2.1","Zed","Zeeff","Zend-2.0","Zimbra-1.3","Zimbra-1.4","Zlib","any-OSI","bcrypt-Solar-Designer","blessing","bzip2-1.0.6","check-cvs","checkmk","copyleft-next-0.3.0","copyleft-next-0.3.1","curl","cve-tou","diffmark","dtoa","dvipdfm","eGenix","etalab-2.0","fwlw","gSOAP-1.3b","gnuplot","gtkbook","hdparm","iMatix","libpng-2.0","libselinux-1.0","libtiff","libutil-David-Nugent","lsof","magaz","mailprio","metamail","mpi-permissive","mpich2","mplus","pkgconf","pnmstitch","psfrag","psutils","python-ldap","radvd","snprintf","softSurfer","ssh-keyscan","swrule","threeparttable","ulem","w3m","xinetd","xkeyboard-config-Zinoviev","xlock","xpp","xzoom","zlib-acknowledgement"]});var UP=Ye((Kme,mY)=>{mY.exports=["AGPL-1.0","AGPL-3.0","BSD-2-Clause-FreeBSD","BSD-2-Clause-NetBSD","GFDL-1.1","GFDL-1.2","GFDL-1.3","GPL-1.0","GPL-2.0","GPL-2.0-with-GCC-exception","GPL-2.0-with-autoconf-exception","GPL-2.0-with-bison-exception","GPL-2.0-with-classpath-exception","GPL-2.0-with-font-exception","GPL-3.0","GPL-3.0-with-GCC-exception","GPL-3.0-with-autoconf-exception","LGPL-2.0","LGPL-2.1","LGPL-3.0","Net-SNMP","Nunit","StandardML-NJ","bzip2-1.0.5","eCos-2.0","wxWindows"]});var 
qP=Ye((Xme,pY)=>{pY.exports=["389-exception","Asterisk-exception","Autoconf-exception-2.0","Autoconf-exception-3.0","Autoconf-exception-generic","Autoconf-exception-generic-3.0","Autoconf-exception-macro","Bison-exception-1.24","Bison-exception-2.2","Bootloader-exception","Classpath-exception-2.0","CLISP-exception-2.0","cryptsetup-OpenSSL-exception","DigiRule-FOSS-exception","eCos-exception-2.0","Fawkes-Runtime-exception","FLTK-exception","fmt-exception","Font-exception-2.0","freertos-exception-2.0","GCC-exception-2.0","GCC-exception-2.0-note","GCC-exception-3.1","Gmsh-exception","GNAT-exception","GNOME-examples-exception","GNU-compiler-exception","gnu-javamail-exception","GPL-3.0-interface-exception","GPL-3.0-linking-exception","GPL-3.0-linking-source-exception","GPL-CC-1.0","GStreamer-exception-2005","GStreamer-exception-2008","i2p-gpl-java-exception","KiCad-libraries-exception","LGPL-3.0-linking-exception","libpri-OpenH323-exception","Libtool-exception","Linux-syscall-note","LLGPL","LLVM-exception","LZMA-exception","mif-exception","OCaml-LGPL-linking-exception","OCCT-exception-1.0","OpenJDK-assembly-exception-1.0","openvpn-openssl-exception","PS-or-PDF-font-exception-20170817","QPL-1.0-INRIA-2004-exception","Qt-GPL-exception-1.0","Qt-LGPL-exception-1.1","Qwt-exception-1.0","SANE-exception","SHL-2.0","SHL-2.1","stunnel-exception","SWI-exception","Swift-exception","Texinfo-exception","u-boot-exception-2.0","UBDL-exception","Universal-FOSS-exception-1.0","vsftpd-openssl-exception","WxWindows-exception-3.1","x11vnc-openssl-exception"]});var GP=Ye((Jme,VP)=>{"use strict";var gY=[].concat(DS()).concat(UP()),vY=qP();VP.exports=function(t){var e=0;function i(){return e1&&t[e-2]===" ")throw new Error("Space before `+`");return m&&{type:"OPERATOR",string:m}}function o(){return n(/[A-Za-z0-9-.]+/)}function a(){var m=o();if(!m)throw new Error("Expected idstring at offset "+e);return m}function l(){if(n("DocumentRef-")){var m=a();return{type:"DOCUMENTREF",string:m}}}function c(){if(n("LicenseRef-")){var m=a();return{type:"LICENSEREF",string:m}}}function u(){var m=e,p=o();if(gY.indexOf(p)!==-1)return{type:"LICENSE",string:p};if(vY.indexOf(p)!==-1)return{type:"EXCEPTION",string:p};e=m}function d(){return s()||l()||c()||u()}for(var f=[];i()&&(r(),!!i());){var h=d();if(!h)throw new Error("Unexpected `"+t[e]+"` at offset "+e);f.push(h)}return f}});var KP=Ye((Zme,YP)=>{"use strict";YP.exports=function(t){var e=0;function i(){return e{"use strict";var bY=GP(),_Y=KP();XP.exports=function(t){return _Y(bY(t))}});var o2=Ye((epe,s2)=>{var xY=JP(),yY=DS();function h0(t){try{return xY(t),!0}catch{return!1}}function r2(t,e){var i=e[0].length-t[0].length;return i!==0?i:t[0].toUpperCase().localeCompare(e[0].toUpperCase())}var ZP=[["APGL","AGPL"],["Gpl","GPL"],["GLP","GPL"],["APL","Apache"],["ISD","ISC"],["GLP","GPL"],["IST","ISC"],["Claude","Clause"],[" or later","+"],[" International",""],["GNU","GPL"],["GUN","GPL"],["+",""],["GNU GPL","GPL"],["GNU LGPL","LGPL"],["GNU/GPL","GPL"],["GNU GLP","GPL"],["GNU LESSER GENERAL PUBLIC LICENSE","LGPL"],["GNU Lesser General Public License","LGPL"],["GNU LESSER GENERAL PUBLIC LICENSE","LGPL-2.1"],["GNU Lesser General Public License","LGPL-2.1"],["LESSER GENERAL PUBLIC LICENSE","LGPL"],["Lesser General Public License","LGPL"],["LESSER GENERAL PUBLIC LICENSE","LGPL-2.1"],["Lesser General Public License","LGPL-2.1"],["GNU General Public License","GPL"],["Gnu public license","GPL"],["GNU Public License","GPL"],["GNU GENERAL PUBLIC LICENSE","GPL"],["MTI","MIT"],["Mozilla Public 
License","MPL"],["Universal Permissive License","UPL"],["WTH","WTF"],["WTFGPL","WTFPL"],["-License",""]].sort(r2),wY=0,CY=1,QP=[function(t){return t.toUpperCase()},function(t){return t.trim()},function(t){return t.replace(/\./g,"")},function(t){return t.replace(/\s+/g,"")},function(t){return t.replace(/\s+/g,"-")},function(t){return t.replace("v","-")},function(t){return t.replace(/,?\s*(\d)/,"-$1")},function(t){return t.replace(/,?\s*(\d)/,"-$1.0")},function(t){return t.replace(/,?\s*(V\.|v\.|V|v|Version|version)\s*(\d)/,"-$2")},function(t){return t.replace(/,?\s*(V\.|v\.|V|v|Version|version)\s*(\d)/,"-$2.0")},function(t){return t[0].toUpperCase()+t.slice(1)},function(t){return t.replace("/","-")},function(t){return t.replace(/\s*V\s*(\d)/,"-$1").replace(/(\d)$/,"$1.0")},function(t){return t.indexOf("3.0")!==-1?t+"-or-later":t+"-only"},function(t){return t+"only"},function(t){return t.replace(/(\d)$/,"-$1.0")},function(t){return t.replace(/(-| )?(\d)$/,"-$2-Clause")},function(t){return t.replace(/(-| )clause(-| )(\d)/,"-$3-Clause")},function(t){return t.replace(/\b(Modified|New|Revised)(-| )?BSD((-| )License)?/i,"BSD-3-Clause")},function(t){return t.replace(/\bSimplified(-| )?BSD((-| )License)?/i,"BSD-2-Clause")},function(t){return t.replace(/\b(Free|Net)(-| )?BSD((-| )License)?/i,"BSD-2-Clause-$1BSD")},function(t){return t.replace(/\bClear(-| )?BSD((-| )License)?/i,"BSD-3-Clause-Clear")},function(t){return t.replace(/\b(Old|Original)(-| )?BSD((-| )License)?/i,"BSD-4-Clause")},function(t){return"CC-"+t},function(t){return"CC-"+t+"-4.0"},function(t){return t.replace("Attribution","BY").replace("NonCommercial","NC").replace("NoDerivatives","ND").replace(/ (\d)/,"-$1").replace(/ ?International/,"")},function(t){return"CC-"+t.replace("Attribution","BY").replace("NonCommercial","NC").replace("NoDerivatives","ND").replace(/ (\d)/,"-$1").replace(/ ?International/,"")+"-4.0"}],OS=yY.map(function(t){var e=/^(.*)-\d+\.\d+$/.exec(t);return e?[e[0],e[1]]:[t,null]}).reduce(function(t,e){var i=e[1];return t[i]=t[i]||[],t[i].push(e[0]),t},{}),SY=Object.keys(OS).map(function(e){return[e,OS[e]]}).filter(function(e){return e[1].length===1&&e[0]!==null&&e[0]!=="APL"}).map(function(e){return[e[0],e[1][0]]});OS=void 0;var e2=[["UNLI","Unlicense"],["WTF","WTFPL"],["2 CLAUSE","BSD-2-Clause"],["2-CLAUSE","BSD-2-Clause"],["3 CLAUSE","BSD-3-Clause"],["3-CLAUSE","BSD-3-Clause"],["AFFERO","AGPL-3.0-or-later"],["AGPL","AGPL-3.0-or-later"],["APACHE","Apache-2.0"],["ARTISTIC","Artistic-2.0"],["Affero","AGPL-3.0-or-later"],["BEER","Beerware"],["BOOST","BSL-1.0"],["BSD","BSD-2-Clause"],["CDDL","CDDL-1.1"],["ECLIPSE","EPL-1.0"],["FUCK","WTFPL"],["GNU","GPL-3.0-or-later"],["LGPL","LGPL-3.0-or-later"],["GPLV1","GPL-1.0-only"],["GPL-1","GPL-1.0-only"],["GPLV2","GPL-2.0-only"],["GPL-2","GPL-2.0-only"],["GPL","GPL-3.0-or-later"],["MIT +NO-FALSE-ATTRIBS","MITNFA"],["MIT","MIT"],["MPL","MPL-2.0"],["X11","X11"],["ZLIB","Zlib"]].concat(SY).sort(r2),EY=0,LY=1,t2=function(t){for(var e=0;e-1)return n[LY]}return null},n2=function(t,e){for(var i=0;i-1){var s=t.replace(r,n[CY]),o=e(s);if(o!==null)return o}}return null};s2.exports=function(t,e){e=e||{};var i=e.upgrade===void 0?!0:!!e.upgrade;function n(a){return i?IY(a):a}var r=typeof t=="string"&&t.trim().length!==0;if(!r)throw Error("Invalid argument. 
Expected non-empty string.");if(t=t.trim(),h0(t))return n(t);var s=t.replace(/\+$/,"").trim();if(h0(s))return n(s);var o=t2(t);return o!==null||(o=n2(t,function(a){return h0(a)?a:t2(a)}),o!==null)||(o=i2(t),o!==null)||(o=n2(t,i2),o!==null)?n(o):null};function IY(t){return["GPL-1.0","LGPL-1.0","AGPL-1.0","GPL-2.0","LGPL-2.0","AGPL-2.0","LGPL-2.1"].indexOf(t)!==-1?t+"-only":["GPL-1.0+","GPL-2.0+","GPL-3.0+","LGPL-2.0+","LGPL-2.1+","LGPL-3.0+","AGPL-1.0+","AGPL-3.0+"].indexOf(t)!==-1?t.replace(/\+$/,"-or-later"):["GPL-3.0","LGPL-3.0","AGPL-3.0"].indexOf(t)!==-1?t+"-or-later":t}});var K2=Ye(b0=>{"use strict";Object.defineProperty(b0,"__esModule",{value:!0});b0.OutputAreaByRef=void 0;var xK=(Gi(),Ka(Cu)),v0=me(),yK=xK.__importDefault(le());b0.OutputAreaByRef=yK.default.forwardRef(({busy:t,content:e},i)=>(0,v0.jsx)("div",{children:(0,v0.jsxs)("div",Object.assign({className:"m-1 hover:delay-15"},{children:[(0,v0.jsx)("div",Object.assign({className:"p-1 rounded",ref:i},{children:e||"[Output Area]"})),t&&(0,v0.jsx)("div",{children:"Cell is running..."})]}))}))});var WS=Ye(jS=>{"use strict";Object.defineProperty(jS,"__esModule",{value:!0});var wK="0.4.10";jS.default=wK});function CK(t,e){let i=new URL(e);return`${t}-${i.origin+i.pathname}`}function _0(t,e,i){let n=`${e}/build/${i}`,r=`${e}/v2/${i}`;return{build:n,launch:r,storageKey:CK(t,n)}}function SK(t){if(!t.binder.repo)throw Error("repo is required for git provider");let{repo:e,binderUrl:i,ref:n}=t.binder,r=encodeURIComponent(e.replace(/(^\/)|(\/?$)/g,"")),s=i?.replace(/(\/?$)/g,""),o=`git/${r}/${n??"HEAD"}`;return _0(t.savedSessions.storagePrefix,s,o)}function EK(t){var e,i,n;if(!t.binder.repo)throw Error("repo is required for gitlab provider");let r=(e=t.binder.binderUrl)===null||e===void 0?void 0:e.replace(/(\/?$)/g,""),o=`gl/${encodeURIComponent(((i=t.binder.repo)!==null&&i!==void 0?i:"").replace(/^(https?:\/\/)?gitlab.com\//,"").replace(/(^\/)|(\/?$)/g,""))}/${(n=t.binder.ref)!==null&&n!==void 0?n:"HEAD"}`;return _0(t.savedSessions.storagePrefix,r,o)}function LK(t){var e,i;if(!t.binder.repo)throw Error("repo is required for github provider");let n=t.binder.repo.replace(/^(https?:\/\/)?github.com\//,"").replace(/(^\/)|(\/?$)/g,""),r=(e=t.binder.binderUrl)===null||e===void 0?void 0:e.replace(/(\/?$)/g,""),s=`gh/${n}/${(i=t.binder.ref)!==null&&i!==void 0?i:"HEAD"}`;return _0(t.savedSessions.storagePrefix,r,s)}function IK(t){var e,i;if(!t.binder.repo)throw Error("repo is required for gist provider");let n=t.binder.repo.replace(/^(https?:\/\/)?github.com\//,"").replace(/(^\/)|(\/?$)/g,""),r=(e=t.binder.binderUrl)===null||e===void 0?void 0:e.replace(/(\/?$)/g,""),s=`gist/${n}/${(i=t.binder.ref)!==null&&i!==void 0?i:"HEAD"}`;return _0(t.savedSessions.storagePrefix,r,s)}function y0(t,e){var i,n;let r=(i=e.reduce((o,a)=>Object.assign(Object.assign({},o),{[a.name]:a}),{}))!==null&&i!==void 0?i:{},s=(n=t.binder.repoProvider)!==null&&n!==void 0?n:"github";if(!Object.keys(r).includes(s))throw Error(`Unknown provider ${t.binder.repoProvider}`);if(!r[s].makeUrls)throw Error(`No makeUrls function for ${s}`);return r[s].makeUrls(t)}var MK,AK,TK,RK,x0,X2=$(()=>{MK={name:"github",makeUrls:LK},AK={name:"gitlab",makeUrls:EK},TK={name:"git",makeUrls:SK},RK={name:"gist",makeUrls:IK},x0=[MK,AK,TK,RK]});function kK(t){let e=window.localStorage.getItem(t);if(!e)return;let i=JSON.parse(e);window.localStorage.setItem(t,JSON.stringify(Object.assign(Object.assign({},i),{lastUsed:new Date})))}function 
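The correction strategy recoverable from the bundle is: accept the input if it already parses as a valid SPDX expression; otherwise apply known typo substitutions and an ordered list of rewrite transforms, returning the first candidate that validates. A minimal sketch of that loop, with a toy `VALID` set standing in for the full SPDX table (`correctLicense` and `TRANSFORMS` are illustrative names, not the bundle's):

```javascript
// Sketch of the license-normalization strategy in the bundle (illustrative only;
// the real code validates candidates with a full SPDX license-expression parser).
const VALID = new Set(["MIT", "Apache-2.0", "GPL-3.0-or-later", "BSD-2-Clause"]);

// A few of the rewrite rules the bundle applies, in order, until one validates.
const TRANSFORMS = [
  (s) => s.trim(),
  (s) => s.replace(/\s+/g, "-"),          // "Apache 2.0" -> "Apache-2.0"
  (s) => s.replace(/,?\s*(\d)/, "-$1.0"), // "Apache 2"   -> "Apache-2.0"
  (s) => s.replace(/\+$/, "-or-later"),   // "GPL-3.0+"   -> "GPL-3.0-or-later"
];

function correctLicense(input) {
  if (VALID.has(input)) return input;
  // Try each transform on the raw string and accept the first valid result.
  for (const t of TRANSFORMS) {
    const candidate = t(input);
    if (VALID.has(candidate)) return candidate;
  }
  return null; // no confident correction found
}

console.log(correctLicense("Apache 2.0")); // "Apache-2.0"
```

The real implementation also composes these transforms with the typo substitutions and finally upgrades deprecated identifiers (for example `GPL-2.0` → `GPL-2.0-only`).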
[Minified JavaScript bundle, continued: thebe Binder session persistence (saving, validating, and expiring server-connection info in localStorage), status/error event plumbing, and the Lumino widget framework (message loop, box/split/accordion layouts, SplitPanel, AccordionPanel, CommandPalette, Menu, and TabBar widgets).]
i=je.removeAt(this._titles,e);i&&(i.changed.disconnect(this._onTitleChanged,this),i===this._previousTitle&&(this._previousTitle=null),this.update(),this._adjustCurrentForRemove(e,i))}clearTabs(){if(this._titles.length===0)return;this._releaseMouse();for(let n of this._titles)n.changed.disconnect(this._onTitleChanged,this);let e=this.currentIndex,i=this.currentTitle;this._currentIndex=-1,this._previousTitle=null,this._titles.length=0,this.update(),e!==-1&&this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}releaseMouse(){this._releaseMouse()}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"dblclick":this._evtDblClick(e);break;case"keydown":e.eventPhase===Event.CAPTURING_PHASE?this._evtKeyDownCapturing(e):this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this),this.node.addEventListener("dblclick",this),this.node.addEventListener("keydown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this.node.removeEventListener("dblclick",this),this.node.removeEventListener("keydown",this),this._releaseMouse()}onUpdateRequest(e){var i;let n=this._titles,r=this.renderer,s=this.currentTitle,o=new Array(n.length),a=(i=this._getCurrentTabindex())!==null&&i!==void 0?i:this._currentIndex>-1?this._currentIndex:0;for(let l=0,c=n.length;loi.hitTest(o,e.clientX,e.clientY));if(n===-1)return;let r=this.titles[n],s=i[n].querySelector(".lm-TabBar-tabLabel");if(s&&s.contains(e.target)){let o=r.label||"",a=s.innerHTML;s.innerHTML="";let l=document.createElement("input");l.classList.add("lm-TabBar-tabInput"),l.value=o,s.appendChild(l);let c=()=>{l.removeEventListener("blur",c),s.innerHTML=a,this.node.addEventListener("keydown",this)};l.addEventListener("dblclick",u=>u.stopPropagation()),l.addEventListener("blur",c),l.addEventListener("keydown",u=>{u.key==="Enter"?(l.value!==""&&(r.label=r.caption=l.value),c()):u.key==="Escape"&&c()}),this.node.removeEventListener("keydown",this),l.select(),l.focus(),s.children.length>0&&s.children[0].focus()}}_evtKeyDownCapturing(e){e.eventPhase===Event.CAPTURING_PHASE&&(e.preventDefault(),e.stopPropagation(),e.key==="Escape"&&this._releaseMouse())}_evtKeyDown(e){var i,n,r;if(!(e.key==="Tab"||e.eventPhase===Event.CAPTURING_PHASE)){if(e.key==="Enter"||e.key==="Spacebar"||e.key===" "){let s=document.activeElement;if(this.addButtonEnabled&&this.addButtonNode.contains(s))e.preventDefault(),e.stopPropagation(),this._addRequested.emit();else{let o=je.findFirstIndex(this.contentNode.children,a=>a.contains(s));o>=0&&(e.preventDefault(),e.stopPropagation(),this.currentIndex=o)}}else if(NK.includes(e.key)){let s=[...this.contentNode.children];if(this.addButtonEnabled&&s.push(this.addButtonNode),s.length<=1)return;e.preventDefault(),e.stopPropagation();let o=s.indexOf(document.activeElement);o===-1&&(o=this._currentIndex);let a;e.key==="ArrowRight"&&this._orientation==="horizontal"||e.key==="ArrowDown"&&this._orientation==="vertical"?a=(i=s[o+1])!==null&&i!==void 0?i:s[0]:e.key==="ArrowLeft"&&this._orientation==="horizontal"||e.key==="ArrowUp"&&this._orientation==="vertical"?a=(n=s[o-1])!==null&&n!==void 0?n:s[s.length-1]:e.key==="Home"?a=s[0]:e.key==="End"&&(a=s[s.length-1]),a&&((r=s[o])===null||r===void 
0||r.setAttribute("tabindex","-1"),a?.setAttribute("tabindex","0"),a.focus())}}}_evtPointerDown(e){if(e.button!==0&&e.button!==1||this._dragData||e.target.classList.contains("lm-TabBar-tabInput"))return;let i=this.addButtonEnabled&&this.addButtonNode.contains(e.target),n=this.contentNode.children,r=je.findFirstIndex(n,o=>oi.hitTest(o,e.clientX,e.clientY));if(r===-1&&!i||(e.preventDefault(),e.stopPropagation(),this._dragData={tab:n[r],index:r,pressX:e.clientX,pressY:e.clientY,tabPos:-1,tabSize:-1,tabPressPos:-1,targetIndex:-1,tabLayout:null,contentRect:null,override:null,dragActive:!1,dragAborted:!1,detachRequested:!1},this.document.addEventListener("pointerup",this,!0),e.button===1||i))return;let s=n[r].querySelector(this.renderer.closeIconSelector);s&&s.contains(e.target)||(this.tabsMovable&&(this.document.addEventListener("pointermove",this,!0),this.document.addEventListener("keydown",this,!0),this.document.addEventListener("contextmenu",this,!0)),this.allowDeselect&&this.currentIndex===r?this.currentIndex=-1:this.currentIndex=r,this.currentIndex!==-1&&this._tabActivateRequested.emit({index:this.currentIndex,title:this.currentTitle}))}_evtPointerMove(e){let i=this._dragData;if(!i)return;e.preventDefault(),e.stopPropagation();let n=this.contentNode.children;if(!(!i.dragActive&&!io.dragExceeded(i,e))){if(!i.dragActive){let r=i.tab.getBoundingClientRect();this._orientation==="horizontal"?(i.tabPos=i.tab.offsetLeft,i.tabSize=r.width,i.tabPressPos=i.pressX-r.left):(i.tabPos=i.tab.offsetTop,i.tabSize=r.height,i.tabPressPos=i.pressY-r.top),i.tabPressOffset={x:i.pressX-r.left,y:i.pressY-r.top},i.tabLayout=io.snapTabLayout(n,this._orientation),i.contentRect=this.contentNode.getBoundingClientRect(),i.override=mn.overrideCursor("default"),i.tab.classList.add("lm-mod-dragging"),this.addClass("lm-mod-dragging"),i.dragActive=!0}if(!i.detachRequested&&io.detachExceeded(i,e)){i.detachRequested=!0;let r=i.index,s=e.clientX,o=e.clientY,a=n[r],l=this._titles[r];if(this._tabDetachRequested.emit({index:r,title:l,tab:a,clientX:s,clientY:o,offset:i.tabPressOffset}),i.dragAborted)return}io.layoutTabs(n,i,e,this._orientation)}}_evtPointerUp(e){if(e.button!==0&&e.button!==1)return;let i=this._dragData;if(!i)return;if(e.preventDefault(),e.stopPropagation(),this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),!i.dragActive){if(this._dragData=null,this.addButtonEnabled&&this.addButtonNode.contains(e.target)){this._addRequested.emit(void 0);return}let s=this.contentNode.children,o=je.findFirstIndex(s,c=>oi.hitTest(c,e.clientX,e.clientY));if(o!==i.index)return;let a=this._titles[o];if(!a.closable)return;if(e.button===1){this._tabCloseRequested.emit({index:o,title:a});return}let l=s[o].querySelector(this.renderer.closeIconSelector);if(l&&l.contains(e.target)){this._tabCloseRequested.emit({index:o,title:a});return}return}if(e.button!==0)return;io.finalizeTabPosition(i,this._orientation),i.tab.classList.remove("lm-mod-dragging");let n=io.parseTransitionDuration(i.tab);setTimeout(()=>{if(i.dragAborted)return;this._dragData=null,io.resetTabPositions(this.contentNode.children,this._orientation),i.override.dispose(),this.removeClass("lm-mod-dragging");let 
r=i.index,s=i.targetIndex;s===-1||r===s||(je.move(this._titles,r,s),this._adjustCurrentForMove(r,s),this._tabMoved.emit({fromIndex:r,toIndex:s,title:this._titles[s]}),De.sendMessage(this,ge.Msg.UpdateRequest))},n)}_releaseMouse(){let e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(io.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Le.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Le.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Le.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Le.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Le.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return 
n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} ${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(Rc||(Rc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof _g?u:new _g(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let m,p,v,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,v=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,v=f.clientY,y=d.contentRect.height);let C=d.index,M=p-d.tabPressPos,O=M+d.tabSize;for(let R=0,_=u.length;R<_;++R){let L,S=d.tabLayout[R],x=S.pos+(S.size>>1);if(Rd.index&&O>x)L=`${-d.tabSize-S.margin}px`,C=Math.max(C,R);else if(R===d.index){let w=v-m,E=y-(d.tabPos+d.tabSize);L=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else L="";h==="horizontal"?u[R].style.left=L:u[R].style.top=L}d.targetIndex=C}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let v=u.tabLayout[u.targetIndex];h=v.pos+v.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(io||(io={}));ZS=class extends va{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=yg.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:ge.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=yg.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():Df()}widgets(){return this._root?this._root.iterUserWidgets():Df()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():Df()}tabBars(){return this._root?this._root.iterTabBars():Df()}handles(){return this._root?this._root.iterHandles():Df()}moveHandle(e,i,n){let 
r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),ps.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=ln.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=ln.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=oi.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new ju(e)),this.parent.isAttached&&De.sendMessage(e,ge.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&De.sendMessage(e,ge.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&De.sendMessage(e,ge.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&De.sendMessage(e,ge.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let 
i=this._root.findTabNode(e);if(!i)return;if(ln.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===ge.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=ge.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=je.removeFirstOf(n.children,i),s=je.removeAt(n.handles,r);if(je.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof ln.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=je.removeAt(c.handles,u);je.removeAt(c.children,u),je.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,v=0,y=1/0,C=1/0,M=m.get(this.tabBar),O=this.tabBar.currentTitle,R=O?m.get(O.owner):void 0,[_,L]=this.sizers;return M&&M.fit(),R&&R.fit(),M&&!M.isHidden?(p=Math.max(p,M.minWidth),v+=M.minHeight,_.minSize=M.minHeight,_.maxSize=M.maxHeight):(_.minSize=0,_.maxSize=0),R&&!R.isHidden?(p=Math.max(p,R.minWidth),v+=R.minHeight,L.minSize=R.minHeight,L.maxSize=1/0):(L.minSize=0,L.maxSize=1/0),{minWidth:p,minHeight:v,maxWidth:y,maxHeight:C}}update(h,m,p,v,y,C){this._top=m,this._left=h,this._width=p,this._height=v;let M=C.get(this.tabBar),O=this.tabBar.currentTitle,R=O?C.get(O.owner):void 0;if(ps.calc(this.sizers,v),M&&!M.isHidden){let _=this.sizers[0].size;M.update(h,m,p,_),m+=_}if(R&&!R.isHidden){let _=this.sizers[1].size;R.update(h,m,p,_)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;mv.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,v)=>p+v.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(v=>v.size),p=m.reduce((v,y)=>v+y,0);if(p===0)for(let v=m.length-1;v>-1;v--)m[v]=1/h;else for(let v=m.length-1;v>-1;v--)m[v]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",v=Math.max(0,this.children.length-1)*h,y=p?v:0,C=p?0:v,M=1/0,O=1/0;for(let 
R=0,_=this.children.length;R<_;++R){let L=this.children[R].fit(h,m);p?(C=Math.max(C,L.minHeight),y+=L.minWidth,this.sizers[R].minSize=L.minWidth):(y=Math.max(y,L.minWidth),C+=L.minHeight,this.sizers[R].minSize=L.minHeight)}return{minWidth:y,minHeight:C,maxWidth:M,maxHeight:O}}update(h,m,p,v,y,C){let M=this.orientation==="horizontal",O=Math.max(0,this.children.length-1)*y,R=Math.max(0,(M?p:v)-O);if(this.normalized){for(let _ of this.sizers)_.sizeHint*=R;this.normalized=!1}ps.calc(this.sizers,R);for(let _=0,L=this.children.length;_=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],v=[];for(let y=0,C=f.children.length;y{let C=n(v,h,m),M=e(f.sizes[y]),O=h.createHandle();p.children.push(C),p.handles.push(O),p.sizers.push(M),C.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(ln||(ln={}));$u=class extends ge{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||$u.defaultRenderer,this._edges=e.edges||Ji.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new ZS({document:this._document,renderer:i,spacing:e.spacing,hiddenMode:e.hiddenMode}),this.overlay=e.overlay||new $u.Overlay,this.node.appendChild(this.overlay.node)}dispose(){this._releaseMouse(),this.overlay.hide(0),this._drag&&this._drag.dispose(),super.dispose()}get hiddenMode(){return this.layout.hiddenMode}set hiddenMode(e){this.layout.hiddenMode=e}get layoutModified(){return this._layoutModified}get addRequested(){return this._addRequested}get renderer(){return this.layout.renderer}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get mode(){return this._mode}set mode(e){if(this._mode===e)return;this._mode=e,this.dataset.mode=e;let i=this.layout;switch(e){case"multiple-document":for(let n of i.tabBars())n.show();break;case"single-document":i.restoreLayout(Ji.createSingleDocumentConfig(this));break;default:throw"unreachable"}De.postMessage(this,Ji.LayoutModified)}get tabsMovable(){return this._tabsMovable}set tabsMovable(e){this._tabsMovable=e;for(let i of this.tabBars())i.tabsMovable=e}get tabsConstrained(){return this._tabsConstrained}set tabsConstrained(e){this._tabsConstrained=e}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled=e;for(let i of this.tabBars())i.addButtonEnabled=e}get isEmpty(){return this.layout.isEmpty}*widgets(){yield*this.layout.widgets()}*selectedWidgets(){yield*this.layout.selectedWidgets()}*tabBars(){yield*this.layout.tabBars()}*handles(){yield*this.layout.handles()}selectWidget(e){let i=qS(this.tabBars(),n=>n.titles.indexOf(e.title)!==-1);if(!i)throw new Error("Widget is not contained in the dock panel.");i.currentTitle=e.title}activateWidget(e){this.selectWidget(e),e.activate()}saveLayout(){return 
this.layout.saveLayout()}restoreLayout(e){this._mode="multiple-document",this.layout.restoreLayout(e),(Bu.IS_EDGE||Bu.IS_IE)&&De.flush(),De.postMessage(this,Ji.LayoutModified)}addWidget(e,i={}){this._mode==="single-document"?this.layout.addWidget(e):this.layout.addWidget(e,i),De.postMessage(this,Ji.LayoutModified)}processMessage(e){e.type==="layout-modified"?this._layoutModified.emit(void 0):super.processMessage(e)}handleEvent(e){switch(e.type){case"lm-dragenter":this._evtDragEnter(e);break;case"lm-dragleave":this._evtDragLeave(e);break;case"lm-dragover":this._evtDragOver(e);break;case"lm-drop":this._evtDrop(e);break;case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("lm-dragenter",this),this.node.addEventListener("lm-dragleave",this),this.node.addEventListener("lm-dragover",this),this.node.addEventListener("lm-drop",this),this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("lm-dragenter",this),this.node.removeEventListener("lm-dragleave",this),this.node.removeEventListener("lm-dragover",this),this.node.removeEventListener("lm-drop",this),this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){Ji.isGeneratedTabBarProperty.get(e.child)||e.child.addClass("lm-DockPanel-widget")}onChildRemoved(e){Ji.isGeneratedTabBarProperty.get(e.child)||(e.child.removeClass("lm-DockPanel-widget"),De.postMessage(this,Ji.LayoutModified))}_evtDragEnter(e){e.mimeData.hasData("application/vnd.lumino.widget-factory")&&(e.preventDefault(),e.stopPropagation())}_evtDragLeave(e){e.preventDefault(),!(this._tabsConstrained&&e.source!==this)&&(e.stopPropagation(),this.overlay.hide(1))}_evtDragOver(e){e.preventDefault(),this._tabsConstrained&&e.source!==this||this._showOverlay(e.clientX,e.clientY)==="invalid"?e.dropAction="none":(e.stopPropagation(),e.dropAction=e.proposedAction)}_evtDrop(e){if(e.preventDefault(),this.overlay.hide(0),e.proposedAction==="none"){e.dropAction="none";return}let{clientX:i,clientY:n}=e,{zone:r,target:s}=Ji.findDropTarget(this,i,n,this._edges);if(this._tabsConstrained&&e.source!==this||r==="invalid"){e.dropAction="none";return}let a=e.mimeData.getData("application/vnd.lumino.widget-factory");if(typeof a!="function"){e.dropAction="none";return}let l=a();if(!(l instanceof ge)){e.dropAction="none";return}if(l.contains(this)){e.dropAction="none";return}let 
c=s?Ji.getDropRef(s.tabBar):null;switch(r){case"root-all":this.addWidget(l);break;case"root-top":this.addWidget(l,{mode:"split-top"});break;case"root-left":this.addWidget(l,{mode:"split-left"});break;case"root-right":this.addWidget(l,{mode:"split-right"});break;case"root-bottom":this.addWidget(l,{mode:"split-bottom"});break;case"widget-all":this.addWidget(l,{mode:"tab-after",ref:c});break;case"widget-top":this.addWidget(l,{mode:"split-top",ref:c});break;case"widget-left":this.addWidget(l,{mode:"split-left",ref:c});break;case"widget-right":this.addWidget(l,{mode:"split-right",ref:c});break;case"widget-bottom":this.addWidget(l,{mode:"split-bottom",ref:c});break;case"widget-tab":this.addWidget(l,{mode:"tab-after",ref:c});break;default:throw"unreachable"}e.dropAction=e.proposedAction,e.stopPropagation(),this.activateWidget(l)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation(),e.keyCode===27&&(this._releaseMouse(),De.postMessage(this,Ji.LayoutModified))}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=e.target,r=qS(i.handles(),u=>u.contains(n));if(!r)return;e.preventDefault(),e.stopPropagation(),this._document.addEventListener("keydown",this,!0),this._document.addEventListener("pointerup",this,!0),this._document.addEventListener("pointermove",this,!0),this._document.addEventListener("contextmenu",this,!0);let s=r.getBoundingClientRect(),o=e.clientX-s.left,a=e.clientY-s.top,l=window.getComputedStyle(r),c=mn.overrideCursor(l.cursor,this._document);this._pressData={handle:r,deltaX:o,deltaY:a,override:c}}_evtPointerMove(e){if(!this._pressData)return;e.preventDefault(),e.stopPropagation();let i=this.node.getBoundingClientRect(),n=e.clientX-i.left-this._pressData.deltaX,r=e.clientY-i.top-this._pressData.deltaY;this.layout.moveHandle(this._pressData.handle,n,r)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse(),De.postMessage(this,Ji.LayoutModified))}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._document.removeEventListener("keydown",this,!0),this._document.removeEventListener("pointerup",this,!0),this._document.removeEventListener("pointermove",this,!0),this._document.removeEventListener("contextmenu",this,!0))}_showOverlay(e,i){let{zone:n,target:r}=Ji.findDropTarget(this,e,i,this._edges);if(n==="invalid")return this.overlay.hide(100),n;let s,o,a,l,c=oi.boxSizing(this.node),u=this.node.getBoundingClientRect();switch(n){case"root-all":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"root-top":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=u.height*Ji.GOLDEN_RATIO;break;case"root-left":s=c.paddingTop,o=c.paddingLeft,a=u.width*Ji.GOLDEN_RATIO,l=c.paddingBottom;break;case"root-right":s=c.paddingTop,o=u.width*Ji.GOLDEN_RATIO,a=c.paddingRight,l=c.paddingBottom;break;case"root-bottom":s=u.height*Ji.GOLDEN_RATIO,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"widget-all":s=r.top,o=r.left,a=r.right,l=r.bottom;break;case"widget-top":s=r.top,o=r.left,a=r.right,l=r.bottom+r.height/2;break;case"widget-left":s=r.top,o=r.left,a=r.right+r.width/2,l=r.bottom;break;case"widget-right":s=r.top,o=r.left+r.width/2,a=r.right,l=r.bottom;break;case"widget-bottom":s=r.top+r.height/2,o=r.left,a=r.right,l=r.bottom;break;case"widget-tab":{let d=r.tabBar.node.getBoundingClientRect().height;s=r.top,o=r.left,a=r.right,l=r.bottom+r.height-d;break}default:throw"unreachable"}return this.overlay.show({top:s,left:o,right:a,bottom:l}),n}_createTabBar(){let 
e=this._renderer.createTabBar(this._document);return Ji.isGeneratedTabBarProperty.set(e,!0),this._mode==="single-document"&&e.hide(),e.tabsMovable=this._tabsMovable,e.allowDeselect=!1,e.addButtonEnabled=this._addButtonEnabled,e.removeBehavior="select-previous-tab",e.insertBehavior="select-tab-if-needed",e.tabMoved.connect(this._onTabMoved,this),e.currentChanged.connect(this._onCurrentChanged,this),e.tabCloseRequested.connect(this._onTabCloseRequested,this),e.tabDetachRequested.connect(this._onTabDetachRequested,this),e.tabActivateRequested.connect(this._onTabActivateRequested,this),e.addRequested.connect(this._onTabAddRequested,this),e}_createHandle(){return this._renderer.createHandle()}_onTabMoved(){De.postMessage(this,Ji.LayoutModified)}_onCurrentChanged(e,i){let{previousTitle:n,currentTitle:r}=i;n&&n.owner.hide(),r&&r.owner.show(),(Bu.IS_EDGE||Bu.IS_IE)&&De.flush(),De.postMessage(this,Ji.LayoutModified)}_onTabAddRequested(e){this._addRequested.emit(e)}_onTabActivateRequested(e,i){i.title.owner.activate()}_onTabCloseRequested(e,i){i.title.owner.close()}_onTabDetachRequested(e,i){if(this._drag)return;e.releaseMouse();let{title:n,tab:r,clientX:s,clientY:o,offset:a}=i,l=new vl.MimeData,c=()=>n.owner;l.setData("application/vnd.lumino.widget-factory",c);let u=r.cloneNode(!0);a&&(u.style.top=`-${a.y}px`,u.style.left=`-${a.x}px`),this._drag=new mn({document:this._document,mimeData:l,dragImage:u,proposedAction:"move",supportedActions:"move",source:this}),r.classList.add("lm-mod-hidden");let d=()=>{this._drag=null,r.classList.remove("lm-mod-hidden")};this._drag.start(s,o).then(d)}};(function(t){class e{constructor(){this._timer=-1,this._hidden=!0,this.node=document.createElement("div"),this.node.classList.add("lm-DockPanel-overlay"),this.node.classList.add("lm-mod-hidden"),this.node.style.position="absolute",this.node.style.contain="strict"}show(r){let s=this.node.style;s.top=`${r.top}px`,s.left=`${r.left}px`,s.right=`${r.right}px`,s.bottom=`${r.bottom}px`,clearTimeout(this._timer),this._timer=-1,this._hidden&&(this._hidden=!1,this.node.classList.remove("lm-mod-hidden"))}hide(r){if(!this._hidden){if(r<=0){clearTimeout(this._timer),this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden");return}this._timer===-1&&(this._timer=window.setTimeout(()=>{this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden")},r))}}}t.Overlay=e;class i{createTabBar(r){let s=new Rc({document:r});return s.addClass("lm-DockPanel-tabBar"),s}createHandle(){let r=document.createElement("div");return r.className="lm-DockPanel-handle",r}}t.Renderer=i,t.defaultRenderer=new i})($u||($u={}));(function(t){t.GOLDEN_RATIO=.618,t.DEFAULT_EDGES={top:12,right:40,bottom:40,left:40},t.LayoutModified=new Tc("layout-modified"),t.isGeneratedTabBarProperty=new bt({name:"isGeneratedTabBar",create:()=>!1});function e(r){if(r.isEmpty)return{main:null};let s=Array.from(r.widgets()),o=r.selectedWidgets().next().value,a=o?s.indexOf(o):-1;return{main:{type:"tab-area",widgets:s,currentIndex:a}}}t.createSingleDocumentConfig=e;function i(r,s,o,a){if(!oi.hitTest(r.node,s,o))return{zone:"invalid",target:null};let l=r.layout;if(l.isEmpty)return{zone:"root-all",target:null};if(r.mode==="multiple-document"){let M=r.node.getBoundingClientRect(),O=s-M.left+1,R=o-M.top+1,_=M.right-s,L=M.bottom-o;switch(Math.min(R,_,L,O)){case R:if(Rp&&f>p&&d>v&&h>v)return{zone:"widget-all",target:c};u/=p,d/=v,f/=p,h/=v;let y=Math.min(u,d,f,h),C;switch(y){case u:C="widget-left";break;case d:C="widget-top";break;case f:C="widget-right";break;case 
h:C="widget-bottom";break;default:throw"unreachable"}return{zone:C,target:c}}t.findDropTarget=i;function n(r){return r.titles.length===0?null:r.currentTitle?r.currentTitle.owner:r.titles[r.titles.length-1].owner}t.getDropRef=n})(Ji||(Ji={}));gl=class extends va{constructor(e={}){super(e),this._dirty=!1,this._rowSpacing=4,this._columnSpacing=4,this._items=[],this._rowStarts=[],this._columnStarts=[],this._rowSizers=[new pa],this._columnSizers=[new pa],this._box=null,e.rowCount!==void 0&&In.reallocSizers(this._rowSizers,e.rowCount),e.columnCount!==void 0&&In.reallocSizers(this._columnSizers,e.columnCount),e.rowSpacing!==void 0&&(this._rowSpacing=In.clampValue(e.rowSpacing)),e.columnSpacing!==void 0&&(this._columnSpacing=In.clampValue(e.columnSpacing))}dispose(){for(let e of this._items){let i=e.widget;e.dispose(),i.dispose()}this._box=null,this._items.length=0,this._rowStarts.length=0,this._rowSizers.length=0,this._columnStarts.length=0,this._columnSizers.length=0,super.dispose()}get rowCount(){return this._rowSizers.length}set rowCount(e){e!==this.rowCount&&(In.reallocSizers(this._rowSizers,e),this.parent&&this.parent.fit())}get columnCount(){return this._columnSizers.length}set columnCount(e){e!==this.columnCount&&(In.reallocSizers(this._columnSizers,e),this.parent&&this.parent.fit())}get rowSpacing(){return this._rowSpacing}set rowSpacing(e){e=In.clampValue(e),this._rowSpacing!==e&&(this._rowSpacing=e,this.parent&&this.parent.fit())}get columnSpacing(){return this._columnSpacing}set columnSpacing(e){e=In.clampValue(e),this._columnSpacing!==e&&(this._columnSpacing=e,this.parent&&this.parent.fit())}rowStretch(e){let i=this._rowSizers[e];return i?i.stretch:-1}setRowStretch(e,i){let n=this._rowSizers[e];n&&(i=In.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}columnStretch(e){let i=this._columnSizers[e];return i?i.stretch:-1}setColumnStretch(e,i){let n=this._columnSizers[e];n&&(i=In.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}*[Symbol.iterator](){for(let e of this._items)yield e.widget}addWidget(e){je.findFirstIndex(this._items,n=>n.widget===e)===-1&&(this._items.push(new ju(e)),this.parent&&this.attachWidget(e))}removeWidget(e){let i=je.findFirstIndex(this._items,r=>r.widget===e);if(i===-1)return;let n=je.removeAt(this._items,i);this.parent&&this.detachWidget(e),n.dispose()}init(){super.init();for(let e of this)this.attachWidget(e)}attachWidget(e){this.parent.isAttached&&De.sendMessage(e,ge.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&De.sendMessage(e,ge.Msg.AfterAttach),this.parent.fit()}detachWidget(e){this.parent.isAttached&&De.sendMessage(e,ge.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&De.sendMessage(e,ge.Msg.AfterDetach),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){for(let l=0,c=this.rowCount;l!l.isHidden);for(let l=0,c=e.length;l({row:0,column:0,rowSpan:1,columnSpan:1}),changed:a});function e(l){let 
c=Math.max(0,Math.floor(l.row||0)),u=Math.max(0,Math.floor(l.column||0)),d=Math.max(1,Math.floor(l.rowSpan||0)),f=Math.max(1,Math.floor(l.columnSpan||0));return{row:c,column:u,rowSpan:d,columnSpan:f}}t.normalizeConfig=e;function i(l){return Math.max(0,Math.floor(l))}t.clampValue=i;function n(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.rowSpan-d.rowSpan}t.rowSpanCmp=n;function r(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.columnSpan-d.columnSpan}t.columnSpanCmp=r;function s(l,c){for(c=Math.max(1,Math.floor(c));l.lengthc&&(l.length=c)}t.reallocSizers=s;function o(l,c,u,d){if(u=d)return;let h=(d-f)/(u-c+1);for(let m=c;m<=u;++m)l[m].minSize+=h}t.distributeMin=o;function a(l){l.parent&&l.parent.layout instanceof gl&&l.parent.fit()}})(In||(In={}));Ff=class extends ge{constructor(e={}){super({node:A0.createNode()}),this._activeIndex=-1,this._tabFocusIndex=0,this._menus=[],this._childMenu=null,this._overflowMenu=null,this._menuItemSizes=[],this._overflowIndex=-1,this.addClass("lm-MenuBar"),this.setFlag(ge.Flag.DisallowLayout),this.renderer=e.renderer||Ff.defaultRenderer,this._forceItemsPosition=e.forceItemsPosition||{forceX:!0,forceY:!0},this._overflowMenuOptions=e.overflowMenuOptions||{isVisible:!0}}dispose(){this._closeChildMenu(),this._menus.length=0,super.dispose()}get childMenu(){return this._childMenu}get overflowIndex(){return this._overflowIndex}get overflowMenu(){return this._overflowMenu}get contentNode(){return this.node.getElementsByClassName("lm-MenuBar-content")[0]}get activeMenu(){return this._menus[this._activeIndex]||null}set activeMenu(e){this.activeIndex=e?this._menus.indexOf(e):-1}get activeIndex(){return this._activeIndex}set activeIndex(e){(e<0||e>=this._menus.length)&&(e=-1),e>-1&&this._menus[e].items.length===0&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this.update())}get menus(){return this._menus}openActiveMenu(){this._activeIndex!==-1&&(this._openChildMenu(),this._childMenu&&(this._childMenu.activeIndex=-1,this._childMenu.activateNextItem()))}addMenu(e,i=!0){this.insertMenu(this._menus.length,e,i)}insertMenu(e,i,n=!0){this._closeChildMenu();let r=this._menus.indexOf(i),s=Math.max(0,Math.min(e,this._menus.length));if(r===-1){je.insert(this._menus,s,i),i.addClass("lm-MenuBar-menu"),i.aboutToClose.connect(this._onMenuAboutToClose,this),i.menuRequested.connect(this._onMenuMenuRequested,this),i.title.changed.connect(this._onTitleChanged,this),n&&this.update();return}s===this._menus.length&&s--,r!==s&&(je.move(this._menus,r,s),n&&this.update())}removeMenu(e,i=!0){this.removeMenuAt(this._menus.indexOf(e),i)}removeMenuAt(e,i=!0){this._closeChildMenu();let n=je.removeAt(this._menus,e);n&&(n.aboutToClose.disconnect(this._onMenuAboutToClose,this),n.menuRequested.disconnect(this._onMenuMenuRequested,this),n.title.changed.disconnect(this._onTitleChanged,this),n.removeClass("lm-MenuBar-menu"),i&&this.update())}clearMenus(){if(this._menus.length!==0){this._closeChildMenu();for(let e of 
this._menus)e.aboutToClose.disconnect(this._onMenuAboutToClose,this),e.menuRequested.disconnect(this._onMenuMenuRequested,this),e.title.changed.disconnect(this._onTitleChanged,this),e.removeClass("lm-MenuBar-menu");this._menus.length=0,this.update()}}handleEvent(e){switch(e.type){case"keydown":this._evtKeyDown(e);break;case"mousedown":this._evtMouseDown(e);break;case"mousemove":this._evtMouseMove(e);break;case"focusout":this._evtFocusOut(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("keydown",this),this.node.addEventListener("mousedown",this),this.node.addEventListener("mousemove",this),this.node.addEventListener("focusout",this),this.node.addEventListener("contextmenu",this)}onAfterDetach(e){this.node.removeEventListener("keydown",this),this.node.removeEventListener("mousedown",this),this.node.removeEventListener("mousemove",this),this.node.removeEventListener("focusout",this),this.node.removeEventListener("contextmenu",this),this._closeChildMenu()}onActivateRequest(e){this.isAttached&&this._focusItemAt(0)}onResize(e){this.update(),super.onResize(e)}onUpdateRequest(e){var i;let n=this._menus,r=this.renderer,s=this._activeIndex,o=this._tabFocusIndex>=0&&this._tabFocusIndex-1?this._overflowIndex:n.length,l=0,c=!1;a=this._overflowMenu!==null?a-1:a;let u=new Array(a);for(let d=0;d{this._tabFocusIndex=d,this.activeIndex=d}}),l+=this._menuItemSizes[d],n[d].title.label===this._overflowMenuOptions.title&&(c=!0,a--);if(this._overflowMenuOptions.isVisible){if(this._overflowIndex>-1&&!c){if(this._overflowMenu===null){let d=(i=this._overflowMenuOptions.title)!==null&&i!==void 0?i:"...";this._overflowMenu=new ga({commands:new ur}),this._overflowMenu.title.label=d,this._overflowMenu.title.mnemonic=0,this.addMenu(this._overflowMenu,!1)}for(let d=n.length-2;d>=a;d--){let f=this.menus[d];f.title.mnemonic=0,this._overflowMenu.insertItem(0,{type:"submenu",submenu:f}),this.removeMenu(f,!1)}u[a]=r.renderItem({title:this._overflowMenu.title,active:a===s&&n[a].items.length!==0,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}else if(this._overflowMenu!==null){let d=this._overflowMenu.items,f=this.node.offsetWidth,h=this._overflowMenu.items.length;for(let m=0;mthis._menuItemSizes[p]){let v=d[0].submenu;this._overflowMenu.removeItemAt(0),this.insertMenu(a,v,!1),u[a]=r.renderItem({title:v.title,active:!1,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}}this._overflowMenu.items.length===0&&(this.removeMenu(this._overflowMenu,!1),u.pop(),this._overflowMenu=null,this._overflowIndex=-1)}}ri.render(u,this.contentNode),this._updateOverflowIndex()}_updateOverflowIndex(){if(!this._overflowMenuOptions.isVisible)return;let e=this.contentNode.childNodes,i=this.node.offsetWidth,n=0,r=-1,s=e.length;if(this._menuItemSizes.length==0)for(let o=0;oi&&r===-1&&(r=o)}else for(let o=0;oi){r=o;break}this._overflowIndex=r}_evtKeyDown(e){let i=e.keyCode;if(i===9){this.activeIndex=-1;return}if(e.preventDefault(),e.stopPropagation(),i===13||i===32||i===38||i===40){if(this.activeIndex=this._tabFocusIndex,this.activeIndex!==this._tabFocusIndex)return;this.openActiveMenu();return}if(i===27){this._closeChildMenu(),this._focusItemAt(this.activeIndex);return}if(i===37||i===39){let o=i===37?-1:1,a=this._tabFocusIndex+o,l=this._menus.length;for(let 
n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} ${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(Nc||(Nc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof Cg?u:new Cg(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let m,p,v,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,v=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,v=f.clientY,y=d.contentRect.height);let C=d.index,M=p-d.tabPressPos,O=M+d.tabSize;for(let R=0,_=u.length;R<_;++R){let L,S=d.tabLayout[R],x=S.pos+(S.size>>1);if(Rd.index&&O>x)L=`${-d.tabSize-S.margin}px`,C=Math.max(C,R);else if(R===d.index){let w=v-m,E=y-(d.tabPos+d.tabSize);L=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else L="";h==="horizontal"?u[R].style.left=L:u[R].style.top=L}d.targetIndex=C}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let v=u.tabLayout[u.targetIndex];h=v.pos+v.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(no||(no={}));uE=class extends xa{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=Eg.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:ve.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=Eg.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():Wf()}widgets(){return this._root?this._root.iterUserWidgets():Wf()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():Wf()}tabBars(){return this._root?this._root.iterTabBars():Wf()}handles(){return this._root?this._root.iterHandles():Wf()}moveHandle(e,i,n){let 
r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),vs.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=cn.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=cn.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=ai.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new Gu(e)),this.parent.isAttached&&Be.sendMessage(e,ve.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Be.sendMessage(e,ve.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&Be.sendMessage(e,ve.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Be.sendMessage(e,ve.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let 
i=this._root.findTabNode(e);if(!i)return;if(cn.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===ve.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=ve.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=We.removeFirstOf(n.children,i),s=We.removeAt(n.handles,r);if(We.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof cn.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=We.removeAt(c.handles,u);We.removeAt(c.children,u),We.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,v=0,y=1/0,C=1/0,M=m.get(this.tabBar),O=this.tabBar.currentTitle,R=O?m.get(O.owner):void 0,[_,L]=this.sizers;return M&&M.fit(),R&&R.fit(),M&&!M.isHidden?(p=Math.max(p,M.minWidth),v+=M.minHeight,_.minSize=M.minHeight,_.maxSize=M.maxHeight):(_.minSize=0,_.maxSize=0),R&&!R.isHidden?(p=Math.max(p,R.minWidth),v+=R.minHeight,L.minSize=R.minHeight,L.maxSize=1/0):(L.minSize=0,L.maxSize=1/0),{minWidth:p,minHeight:v,maxWidth:y,maxHeight:C}}update(h,m,p,v,y,C){this._top=m,this._left=h,this._width=p,this._height=v;let M=C.get(this.tabBar),O=this.tabBar.currentTitle,R=O?C.get(O.owner):void 0;if(vs.calc(this.sizers,v),M&&!M.isHidden){let _=this.sizers[0].size;M.update(h,m,p,_),m+=_}if(R&&!R.isHidden){let _=this.sizers[1].size;R.update(h,m,p,_)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;mv.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,v)=>p+v.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(v=>v.size),p=m.reduce((v,y)=>v+y,0);if(p===0)for(let v=m.length-1;v>-1;v--)m[v]=1/h;else for(let v=m.length-1;v>-1;v--)m[v]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",v=Math.max(0,this.children.length-1)*h,y=p?v:0,C=p?0:v,M=1/0,O=1/0;for(let 
R=0,_=this.children.length;R<_;++R){let L=this.children[R].fit(h,m);p?(C=Math.max(C,L.minHeight),y+=L.minWidth,this.sizers[R].minSize=L.minWidth):(y=Math.max(y,L.minWidth),C+=L.minHeight,this.sizers[R].minSize=L.minHeight)}return{minWidth:y,minHeight:C,maxWidth:M,maxHeight:O}}update(h,m,p,v,y,C){let M=this.orientation==="horizontal",O=Math.max(0,this.children.length-1)*y,R=Math.max(0,(M?p:v)-O);if(this.normalized){for(let _ of this.sizers)_.sizeHint*=R;this.normalized=!1}vs.calc(this.sizers,R);for(let _=0,L=this.children.length;_=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],v=[];for(let y=0,C=f.children.length;y{let C=n(v,h,m),M=e(f.sizes[y]),O=h.createHandle();p.children.push(C),p.handles.push(O),p.sizers.push(M),C.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(cn||(cn={}));Ku=class extends ve{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||Ku.defaultRenderer,this._edges=e.edges||Zi.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new uE({document:this._document,renderer:i,spacing:e.spacing,hiddenMode:e.hiddenMode}),this.overlay=e.overlay||new Ku.Overlay,this.node.appendChild(this.overlay.node)}dispose(){this._releaseMouse(),this.overlay.hide(0),this._drag&&this._drag.dispose(),super.dispose()}get hiddenMode(){return this.layout.hiddenMode}set hiddenMode(e){this.layout.hiddenMode=e}get layoutModified(){return this._layoutModified}get addRequested(){return this._addRequested}get renderer(){return this.layout.renderer}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get mode(){return this._mode}set mode(e){if(this._mode===e)return;this._mode=e,this.dataset.mode=e;let i=this.layout;switch(e){case"multiple-document":for(let n of i.tabBars())n.show();break;case"single-document":i.restoreLayout(Zi.createSingleDocumentConfig(this));break;default:throw"unreachable"}Be.postMessage(this,Zi.LayoutModified)}get tabsMovable(){return this._tabsMovable}set tabsMovable(e){this._tabsMovable=e;for(let i of this.tabBars())i.tabsMovable=e}get tabsConstrained(){return this._tabsConstrained}set tabsConstrained(e){this._tabsConstrained=e}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled=e;for(let i of this.tabBars())i.addButtonEnabled=e}get isEmpty(){return this.layout.isEmpty}*widgets(){yield*this.layout.widgets()}*selectedWidgets(){yield*this.layout.selectedWidgets()}*tabBars(){yield*this.layout.tabBars()}*handles(){yield*this.layout.handles()}selectWidget(e){let i=nE(this.tabBars(),n=>n.titles.indexOf(e.title)!==-1);if(!i)throw new Error("Widget is not contained in the dock panel.");i.currentTitle=e.title}activateWidget(e){this.selectWidget(e),e.activate()}saveLayout(){return 
this.layout.saveLayout()}restoreLayout(e){this._mode="multiple-document",this.layout.restoreLayout(e),(Uu.IS_EDGE||Uu.IS_IE)&&Be.flush(),Be.postMessage(this,Zi.LayoutModified)}addWidget(e,i={}){this._mode==="single-document"?this.layout.addWidget(e):this.layout.addWidget(e,i),Be.postMessage(this,Zi.LayoutModified)}processMessage(e){e.type==="layout-modified"?this._layoutModified.emit(void 0):super.processMessage(e)}handleEvent(e){switch(e.type){case"lm-dragenter":this._evtDragEnter(e);break;case"lm-dragleave":this._evtDragLeave(e);break;case"lm-dragover":this._evtDragOver(e);break;case"lm-drop":this._evtDrop(e);break;case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("lm-dragenter",this),this.node.addEventListener("lm-dragleave",this),this.node.addEventListener("lm-dragover",this),this.node.addEventListener("lm-drop",this),this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("lm-dragenter",this),this.node.removeEventListener("lm-dragleave",this),this.node.removeEventListener("lm-dragover",this),this.node.removeEventListener("lm-drop",this),this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){Zi.isGeneratedTabBarProperty.get(e.child)||e.child.addClass("lm-DockPanel-widget")}onChildRemoved(e){Zi.isGeneratedTabBarProperty.get(e.child)||(e.child.removeClass("lm-DockPanel-widget"),Be.postMessage(this,Zi.LayoutModified))}_evtDragEnter(e){e.mimeData.hasData("application/vnd.lumino.widget-factory")&&(e.preventDefault(),e.stopPropagation())}_evtDragLeave(e){e.preventDefault(),!(this._tabsConstrained&&e.source!==this)&&(e.stopPropagation(),this.overlay.hide(1))}_evtDragOver(e){e.preventDefault(),this._tabsConstrained&&e.source!==this||this._showOverlay(e.clientX,e.clientY)==="invalid"?e.dropAction="none":(e.stopPropagation(),e.dropAction=e.proposedAction)}_evtDrop(e){if(e.preventDefault(),this.overlay.hide(0),e.proposedAction==="none"){e.dropAction="none";return}let{clientX:i,clientY:n}=e,{zone:r,target:s}=Zi.findDropTarget(this,i,n,this._edges);if(this._tabsConstrained&&e.source!==this||r==="invalid"){e.dropAction="none";return}let a=e.mimeData.getData("application/vnd.lumino.widget-factory");if(typeof a!="function"){e.dropAction="none";return}let l=a();if(!(l instanceof ve)){e.dropAction="none";return}if(l.contains(this)){e.dropAction="none";return}let 
c=s?Zi.getDropRef(s.tabBar):null;switch(r){case"root-all":this.addWidget(l);break;case"root-top":this.addWidget(l,{mode:"split-top"});break;case"root-left":this.addWidget(l,{mode:"split-left"});break;case"root-right":this.addWidget(l,{mode:"split-right"});break;case"root-bottom":this.addWidget(l,{mode:"split-bottom"});break;case"widget-all":this.addWidget(l,{mode:"tab-after",ref:c});break;case"widget-top":this.addWidget(l,{mode:"split-top",ref:c});break;case"widget-left":this.addWidget(l,{mode:"split-left",ref:c});break;case"widget-right":this.addWidget(l,{mode:"split-right",ref:c});break;case"widget-bottom":this.addWidget(l,{mode:"split-bottom",ref:c});break;case"widget-tab":this.addWidget(l,{mode:"tab-after",ref:c});break;default:throw"unreachable"}e.dropAction=e.proposedAction,e.stopPropagation(),this.activateWidget(l)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation(),e.keyCode===27&&(this._releaseMouse(),Be.postMessage(this,Zi.LayoutModified))}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=e.target,r=nE(i.handles(),u=>u.contains(n));if(!r)return;e.preventDefault(),e.stopPropagation(),this._document.addEventListener("keydown",this,!0),this._document.addEventListener("pointerup",this,!0),this._document.addEventListener("pointermove",this,!0),this._document.addEventListener("contextmenu",this,!0);let s=r.getBoundingClientRect(),o=e.clientX-s.left,a=e.clientY-s.top,l=window.getComputedStyle(r),c=mn.overrideCursor(l.cursor,this._document);this._pressData={handle:r,deltaX:o,deltaY:a,override:c}}_evtPointerMove(e){if(!this._pressData)return;e.preventDefault(),e.stopPropagation();let i=this.node.getBoundingClientRect(),n=e.clientX-i.left-this._pressData.deltaX,r=e.clientY-i.top-this._pressData.deltaY;this.layout.moveHandle(this._pressData.handle,n,r)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse(),Be.postMessage(this,Zi.LayoutModified))}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._document.removeEventListener("keydown",this,!0),this._document.removeEventListener("pointerup",this,!0),this._document.removeEventListener("pointermove",this,!0),this._document.removeEventListener("contextmenu",this,!0))}_showOverlay(e,i){let{zone:n,target:r}=Zi.findDropTarget(this,e,i,this._edges);if(n==="invalid")return this.overlay.hide(100),n;let s,o,a,l,c=ai.boxSizing(this.node),u=this.node.getBoundingClientRect();switch(n){case"root-all":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"root-top":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=u.height*Zi.GOLDEN_RATIO;break;case"root-left":s=c.paddingTop,o=c.paddingLeft,a=u.width*Zi.GOLDEN_RATIO,l=c.paddingBottom;break;case"root-right":s=c.paddingTop,o=u.width*Zi.GOLDEN_RATIO,a=c.paddingRight,l=c.paddingBottom;break;case"root-bottom":s=u.height*Zi.GOLDEN_RATIO,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"widget-all":s=r.top,o=r.left,a=r.right,l=r.bottom;break;case"widget-top":s=r.top,o=r.left,a=r.right,l=r.bottom+r.height/2;break;case"widget-left":s=r.top,o=r.left,a=r.right+r.width/2,l=r.bottom;break;case"widget-right":s=r.top,o=r.left+r.width/2,a=r.right,l=r.bottom;break;case"widget-bottom":s=r.top+r.height/2,o=r.left,a=r.right,l=r.bottom;break;case"widget-tab":{let d=r.tabBar.node.getBoundingClientRect().height;s=r.top,o=r.left,a=r.right,l=r.bottom+r.height-d;break}default:throw"unreachable"}return this.overlay.show({top:s,left:o,right:a,bottom:l}),n}_createTabBar(){let 
e=this._renderer.createTabBar(this._document);return Zi.isGeneratedTabBarProperty.set(e,!0),this._mode==="single-document"&&e.hide(),e.tabsMovable=this._tabsMovable,e.allowDeselect=!1,e.addButtonEnabled=this._addButtonEnabled,e.removeBehavior="select-previous-tab",e.insertBehavior="select-tab-if-needed",e.tabMoved.connect(this._onTabMoved,this),e.currentChanged.connect(this._onCurrentChanged,this),e.tabCloseRequested.connect(this._onTabCloseRequested,this),e.tabDetachRequested.connect(this._onTabDetachRequested,this),e.tabActivateRequested.connect(this._onTabActivateRequested,this),e.addRequested.connect(this._onTabAddRequested,this),e}_createHandle(){return this._renderer.createHandle()}_onTabMoved(){Be.postMessage(this,Zi.LayoutModified)}_onCurrentChanged(e,i){let{previousTitle:n,currentTitle:r}=i;n&&n.owner.hide(),r&&r.owner.show(),(Uu.IS_EDGE||Uu.IS_IE)&&Be.flush(),Be.postMessage(this,Zi.LayoutModified)}_onTabAddRequested(e){this._addRequested.emit(e)}_onTabActivateRequested(e,i){i.title.owner.activate()}_onTabCloseRequested(e,i){i.title.owner.close()}_onTabDetachRequested(e,i){if(this._drag)return;e.releaseMouse();let{title:n,tab:r,clientX:s,clientY:o,offset:a}=i,l=new wl.MimeData,c=()=>n.owner;l.setData("application/vnd.lumino.widget-factory",c);let u=r.cloneNode(!0);a&&(u.style.top=`-${a.y}px`,u.style.left=`-${a.x}px`),this._drag=new mn({document:this._document,mimeData:l,dragImage:u,proposedAction:"move",supportedActions:"move",source:this}),r.classList.add("lm-mod-hidden");let d=()=>{this._drag=null,r.classList.remove("lm-mod-hidden")};this._drag.start(s,o).then(d)}};(function(t){class e{constructor(){this._timer=-1,this._hidden=!0,this.node=document.createElement("div"),this.node.classList.add("lm-DockPanel-overlay"),this.node.classList.add("lm-mod-hidden"),this.node.style.position="absolute",this.node.style.contain="strict"}show(r){let s=this.node.style;s.top=`${r.top}px`,s.left=`${r.left}px`,s.right=`${r.right}px`,s.bottom=`${r.bottom}px`,clearTimeout(this._timer),this._timer=-1,this._hidden&&(this._hidden=!1,this.node.classList.remove("lm-mod-hidden"))}hide(r){if(!this._hidden){if(r<=0){clearTimeout(this._timer),this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden");return}this._timer===-1&&(this._timer=window.setTimeout(()=>{this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden")},r))}}}t.Overlay=e;class i{createTabBar(r){let s=new Nc({document:r});return s.addClass("lm-DockPanel-tabBar"),s}createHandle(){let r=document.createElement("div");return r.className="lm-DockPanel-handle",r}}t.Renderer=i,t.defaultRenderer=new i})(Ku||(Ku={}));(function(t){t.GOLDEN_RATIO=.618,t.DEFAULT_EDGES={top:12,right:40,bottom:40,left:40},t.LayoutModified=new kc("layout-modified"),t.isGeneratedTabBarProperty=new bt({name:"isGeneratedTabBar",create:()=>!1});function e(r){if(r.isEmpty)return{main:null};let s=Array.from(r.widgets()),o=r.selectedWidgets().next().value,a=o?s.indexOf(o):-1;return{main:{type:"tab-area",widgets:s,currentIndex:a}}}t.createSingleDocumentConfig=e;function i(r,s,o,a){if(!ai.hitTest(r.node,s,o))return{zone:"invalid",target:null};let l=r.layout;if(l.isEmpty)return{zone:"root-all",target:null};if(r.mode==="multiple-document"){let M=r.node.getBoundingClientRect(),O=s-M.left+1,R=o-M.top+1,_=M.right-s,L=M.bottom-o;switch(Math.min(R,_,L,O)){case R:if(Rp&&f>p&&d>v&&h>v)return{zone:"widget-all",target:c};u/=p,d/=v,f/=p,h/=v;let y=Math.min(u,d,f,h),C;switch(y){case u:C="widget-left";break;case d:C="widget-top";break;case f:C="widget-right";break;case 
h:C="widget-bottom";break;default:throw"unreachable"}return{zone:C,target:c}}t.findDropTarget=i;function n(r){return r.titles.length===0?null:r.currentTitle?r.currentTitle.owner:r.titles[r.titles.length-1].owner}t.getDropRef=n})(Zi||(Zi={}));yl=class extends xa{constructor(e={}){super(e),this._dirty=!1,this._rowSpacing=4,this._columnSpacing=4,this._items=[],this._rowStarts=[],this._columnStarts=[],this._rowSizers=[new ba],this._columnSizers=[new ba],this._box=null,e.rowCount!==void 0&&Mn.reallocSizers(this._rowSizers,e.rowCount),e.columnCount!==void 0&&Mn.reallocSizers(this._columnSizers,e.columnCount),e.rowSpacing!==void 0&&(this._rowSpacing=Mn.clampValue(e.rowSpacing)),e.columnSpacing!==void 0&&(this._columnSpacing=Mn.clampValue(e.columnSpacing))}dispose(){for(let e of this._items){let i=e.widget;e.dispose(),i.dispose()}this._box=null,this._items.length=0,this._rowStarts.length=0,this._rowSizers.length=0,this._columnStarts.length=0,this._columnSizers.length=0,super.dispose()}get rowCount(){return this._rowSizers.length}set rowCount(e){e!==this.rowCount&&(Mn.reallocSizers(this._rowSizers,e),this.parent&&this.parent.fit())}get columnCount(){return this._columnSizers.length}set columnCount(e){e!==this.columnCount&&(Mn.reallocSizers(this._columnSizers,e),this.parent&&this.parent.fit())}get rowSpacing(){return this._rowSpacing}set rowSpacing(e){e=Mn.clampValue(e),this._rowSpacing!==e&&(this._rowSpacing=e,this.parent&&this.parent.fit())}get columnSpacing(){return this._columnSpacing}set columnSpacing(e){e=Mn.clampValue(e),this._columnSpacing!==e&&(this._columnSpacing=e,this.parent&&this.parent.fit())}rowStretch(e){let i=this._rowSizers[e];return i?i.stretch:-1}setRowStretch(e,i){let n=this._rowSizers[e];n&&(i=Mn.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}columnStretch(e){let i=this._columnSizers[e];return i?i.stretch:-1}setColumnStretch(e,i){let n=this._columnSizers[e];n&&(i=Mn.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}*[Symbol.iterator](){for(let e of this._items)yield e.widget}addWidget(e){We.findFirstIndex(this._items,n=>n.widget===e)===-1&&(this._items.push(new Gu(e)),this.parent&&this.attachWidget(e))}removeWidget(e){let i=We.findFirstIndex(this._items,r=>r.widget===e);if(i===-1)return;let n=We.removeAt(this._items,i);this.parent&&this.detachWidget(e),n.dispose()}init(){super.init();for(let e of this)this.attachWidget(e)}attachWidget(e){this.parent.isAttached&&Be.sendMessage(e,ve.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Be.sendMessage(e,ve.Msg.AfterAttach),this.parent.fit()}detachWidget(e){this.parent.isAttached&&Be.sendMessage(e,ve.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Be.sendMessage(e,ve.Msg.AfterDetach),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){for(let l=0,c=this.rowCount;l!l.isHidden);for(let l=0,c=e.length;l({row:0,column:0,rowSpan:1,columnSpan:1}),changed:a});function e(l){let 
c=Math.max(0,Math.floor(l.row||0)),u=Math.max(0,Math.floor(l.column||0)),d=Math.max(1,Math.floor(l.rowSpan||0)),f=Math.max(1,Math.floor(l.columnSpan||0));return{row:c,column:u,rowSpan:d,columnSpan:f}}t.normalizeConfig=e;function i(l){return Math.max(0,Math.floor(l))}t.clampValue=i;function n(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.rowSpan-d.rowSpan}t.rowSpanCmp=n;function r(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.columnSpan-d.columnSpan}t.columnSpanCmp=r;function s(l,c){for(c=Math.max(1,Math.floor(c));l.lengthc&&(l.length=c)}t.reallocSizers=s;function o(l,c,u,d){if(u=d)return;let h=(d-f)/(u-c+1);for(let m=c;m<=u;++m)l[m].minSize+=h}t.distributeMin=o;function a(l){l.parent&&l.parent.layout instanceof yl&&l.parent.fit()}})(Mn||(Mn={}));Gf=class extends ve{constructor(e={}){super({node:O0.createNode()}),this._activeIndex=-1,this._tabFocusIndex=0,this._menus=[],this._childMenu=null,this._overflowMenu=null,this._menuItemSizes=[],this._overflowIndex=-1,this.addClass("lm-MenuBar"),this.setFlag(ve.Flag.DisallowLayout),this.renderer=e.renderer||Gf.defaultRenderer,this._forceItemsPosition=e.forceItemsPosition||{forceX:!0,forceY:!0},this._overflowMenuOptions=e.overflowMenuOptions||{isVisible:!0}}dispose(){this._closeChildMenu(),this._menus.length=0,super.dispose()}get childMenu(){return this._childMenu}get overflowIndex(){return this._overflowIndex}get overflowMenu(){return this._overflowMenu}get contentNode(){return this.node.getElementsByClassName("lm-MenuBar-content")[0]}get activeMenu(){return this._menus[this._activeIndex]||null}set activeMenu(e){this.activeIndex=e?this._menus.indexOf(e):-1}get activeIndex(){return this._activeIndex}set activeIndex(e){(e<0||e>=this._menus.length)&&(e=-1),e>-1&&this._menus[e].items.length===0&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this.update())}get menus(){return this._menus}openActiveMenu(){this._activeIndex!==-1&&(this._openChildMenu(),this._childMenu&&(this._childMenu.activeIndex=-1,this._childMenu.activateNextItem()))}addMenu(e,i=!0){this.insertMenu(this._menus.length,e,i)}insertMenu(e,i,n=!0){this._closeChildMenu();let r=this._menus.indexOf(i),s=Math.max(0,Math.min(e,this._menus.length));if(r===-1){We.insert(this._menus,s,i),i.addClass("lm-MenuBar-menu"),i.aboutToClose.connect(this._onMenuAboutToClose,this),i.menuRequested.connect(this._onMenuMenuRequested,this),i.title.changed.connect(this._onTitleChanged,this),n&&this.update();return}s===this._menus.length&&s--,r!==s&&(We.move(this._menus,r,s),n&&this.update())}removeMenu(e,i=!0){this.removeMenuAt(this._menus.indexOf(e),i)}removeMenuAt(e,i=!0){this._closeChildMenu();let n=We.removeAt(this._menus,e);n&&(n.aboutToClose.disconnect(this._onMenuAboutToClose,this),n.menuRequested.disconnect(this._onMenuMenuRequested,this),n.title.changed.disconnect(this._onTitleChanged,this),n.removeClass("lm-MenuBar-menu"),i&&this.update())}clearMenus(){if(this._menus.length!==0){this._closeChildMenu();for(let e of 
this._menus)e.aboutToClose.disconnect(this._onMenuAboutToClose,this),e.menuRequested.disconnect(this._onMenuMenuRequested,this),e.title.changed.disconnect(this._onTitleChanged,this),e.removeClass("lm-MenuBar-menu");this._menus.length=0,this.update()}}handleEvent(e){switch(e.type){case"keydown":this._evtKeyDown(e);break;case"mousedown":this._evtMouseDown(e);break;case"mousemove":this._evtMouseMove(e);break;case"focusout":this._evtFocusOut(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("keydown",this),this.node.addEventListener("mousedown",this),this.node.addEventListener("mousemove",this),this.node.addEventListener("focusout",this),this.node.addEventListener("contextmenu",this)}onAfterDetach(e){this.node.removeEventListener("keydown",this),this.node.removeEventListener("mousedown",this),this.node.removeEventListener("mousemove",this),this.node.removeEventListener("focusout",this),this.node.removeEventListener("contextmenu",this),this._closeChildMenu()}onActivateRequest(e){this.isAttached&&this._focusItemAt(0)}onResize(e){this.update(),super.onResize(e)}onUpdateRequest(e){var i;let n=this._menus,r=this.renderer,s=this._activeIndex,o=this._tabFocusIndex>=0&&this._tabFocusIndex-1?this._overflowIndex:n.length,l=0,c=!1;a=this._overflowMenu!==null?a-1:a;let u=new Array(a);for(let d=0;d{this._tabFocusIndex=d,this.activeIndex=d}}),l+=this._menuItemSizes[d],n[d].title.label===this._overflowMenuOptions.title&&(c=!0,a--);if(this._overflowMenuOptions.isVisible){if(this._overflowIndex>-1&&!c){if(this._overflowMenu===null){let d=(i=this._overflowMenuOptions.title)!==null&&i!==void 0?i:"...";this._overflowMenu=new _a({commands:new ur}),this._overflowMenu.title.label=d,this._overflowMenu.title.mnemonic=0,this.addMenu(this._overflowMenu,!1)}for(let d=n.length-2;d>=a;d--){let f=this.menus[d];f.title.mnemonic=0,this._overflowMenu.insertItem(0,{type:"submenu",submenu:f}),this.removeMenu(f,!1)}u[a]=r.renderItem({title:this._overflowMenu.title,active:a===s&&n[a].items.length!==0,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}else if(this._overflowMenu!==null){let d=this._overflowMenu.items,f=this.node.offsetWidth,h=this._overflowMenu.items.length;for(let m=0;mthis._menuItemSizes[p]){let v=d[0].submenu;this._overflowMenu.removeItemAt(0),this.insertMenu(a,v,!1),u[a]=r.renderItem({title:v.title,active:!1,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}}this._overflowMenu.items.length===0&&(this.removeMenu(this._overflowMenu,!1),u.pop(),this._overflowMenu=null,this._overflowIndex=-1)}}ri.render(u,this.contentNode),this._updateOverflowIndex()}_updateOverflowIndex(){if(!this._overflowMenuOptions.isVisible)return;let e=this.contentNode.childNodes,i=this.node.offsetWidth,n=0,r=-1,s=e.length;if(this._menuItemSizes.length==0)for(let o=0;oi&&r===-1&&(r=o)}else for(let o=0;oi){r=o;break}this._overflowIndex=r}_evtKeyDown(e){let i=e.keyCode;if(i===9){this.activeIndex=-1;return}if(e.preventDefault(),e.stopPropagation(),i===13||i===32||i===38||i===40){if(this.activeIndex=this._tabFocusIndex,this.activeIndex!==this._tabFocusIndex)return;this.openActiveMenu();return}if(i===27){this._closeChildMenu(),this._focusItemAt(this.activeIndex);return}if(i===37||i===39){let o=i===37?-1:1,a=this._tabFocusIndex+o,l=this._menus.length;for(let 
c=0;cai.hitTest(n,e.clientX,e.clientY));if(i===-1){this._closeChildMenu();return}if(e.button===0)if(this._childMenu)this._closeChildMenu(),this.activeIndex=i;else{e.preventDefault();let n=this._positionForMenu(i);_a.saveWindowData(),this.activeIndex=i,this._openChildMenu(n)}}_evtMouseMove(e){let i=We.findFirstIndex(this.contentNode.children,r=>ai.hitTest(r,e.clientX,e.clientY));if(i===this._activeIndex||i===-1&&this._childMenu)return;let n=i>=0&&this._childMenu?this._positionForMenu(i):null;_a.saveWindowData(),this.activeIndex=i,n&&this._openChildMenu(n)}_positionForMenu(e){let i=this.contentNode.children[e],{left:n,bottom:r}=i.getBoundingClientRect();return{top:r,left:n}}_evtFocusOut(e){!this._childMenu&&!this.node.contains(e.relatedTarget)&&(this.activeIndex=-1)}_focusItemAt(e){let i=this.contentNode.childNodes[e];i&&i.focus()}_openChildMenu(e={}){let i=this.activeMenu;if(!i){this._closeChildMenu();return}let n=this._childMenu;if(n===i)return;this._childMenu=i,n?n.close():document.addEventListener("mousedown",this,!0),this._tabFocusIndex=this.activeIndex,Be.sendMessage(this,ve.Msg.UpdateRequest);let{left:r,top:s}=e;(typeof r>"u"||typeof s>"u")&&({left:r,top:s}=this._positionForMenu(this._activeIndex)),n||this.addClass("lm-mod-active"),i.items.length>0&&i.open(r,s,this._forceItemsPosition)}_closeChildMenu(){if(!this._childMenu)return;this.removeClass("lm-mod-active"),document.removeEventListener("mousedown",this,!0);let e=this._childMenu;this._childMenu=null,e.close(),this.activeIndex=-1}_onMenuAboutToClose(e){e===this._childMenu&&(this.removeClass("lm-mod-active"),document.removeEventListener("mousedown",this,!0),this._childMenu=null,this.activeIndex=-1)}_onMenuMenuRequested(e,i){if(e!==this._childMenu)return;let n=this._activeIndex,r=this._menus.length;switch(i){case"next":this.activeIndex=n===r-1?0:n+1;break;case"previous":this.activeIndex=n===0?r-1:n-1;break}this.openActiveMenu()}_onTitleChanged(){this.update()}};(function(t){class e{renderItem(n){let r=this.createItemClass(n),s=this.createItemDataset(n),o=this.createItemARIA(n);return Le.li({className:r,dataset:s,...n.disabled?{}:{tabindex:n.tabbable?"0":"-1"},onfocus:n.onfocus,...o},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let r=this.createIconClass(n);return Le.div({className:r},n.title.icon,n.title.iconLabel)}renderLabel(n){let r=this.formatLabel(n);return Le.div({className:"lm-MenuBar-itemLabel"},r)}createItemClass(n){let r="lm-MenuBar-item";return n.title.className&&(r+=` ${n.title.className}`),n.active&&!n.disabled&&(r+=" lm-mod-active"),r}createItemDataset(n){return n.title.dataset}createItemARIA(n){return{role:"menuitem","aria-haspopup":"true","aria-disabled":n.disabled?"true":"false"}}createIconClass(n){let r="lm-MenuBar-itemIcon",s=n.title.iconClass;return s?`${r} ${s}`:r}formatLabel(n){let{label:r,mnemonic:s}=n.title;if(s<0||s>=r.length)return r;let o=r.slice(0,s),a=r.slice(s+1),l=r[s],c=Le.span({className:"lm-MenuBar-itemMnemonic"},l);return[o,c,a]}}t.Renderer=e,t.defaultRenderer=new e})(Gf||(Gf={}));(function(t){function e(){let n=document.createElement("div"),r=document.createElement("ul");return r.className="lm-MenuBar-content",n.appendChild(r),r.setAttribute("role","menubar"),n}t.createNode=e;function i(n,r,s){let o=-1,a=-1,l=!1,c=r.toUpperCase();for(let u=0,d=n.length;u=0&&m1&&this.widgets.forEach(i=>{i.hiddenMode=this._hiddenMode}))}dispose(){for(let e of 
this._items)e.dispose();this._box=null,this._items.length=0,super.dispose()}attachWidget(e,i){this._hiddenMode===ve.HiddenMode.Scale&&this._items.length>0?(this._items.length===1&&(this.widgets[0].hiddenMode=ve.HiddenMode.Scale),i.hiddenMode=ve.HiddenMode.Scale):i.hiddenMode=ve.HiddenMode.Display,We.insert(this._items,e,new Gu(i)),this.parent.isAttached&&Be.sendMessage(i,ve.Msg.BeforeAttach),this.parent.node.appendChild(i.node),this.parent.isAttached&&Be.sendMessage(i,ve.Msg.AfterAttach),this.parent.fit()}moveWidget(e,i,n){We.move(this._items,e,i),this.parent.update()}detachWidget(e,i){let n=We.removeAt(this._items,e);this.parent.isAttached&&Be.sendMessage(i,ve.Msg.BeforeDetach),this.parent.node.removeChild(i.node),this.parent.isAttached&&Be.sendMessage(i,ve.Msg.AfterDetach),n.widget.node.style.zIndex="",this._hiddenMode===ve.HiddenMode.Scale&&(i.hiddenMode=ve.HiddenMode.Display,this._items.length===1&&(this._items[0].widget.hiddenMode=ve.HiddenMode.Display)),n.dispose(),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){let e=0,i=0;for(let s=0,o=this._items.length;s{var PK=1/0,zK="[object Symbol]",kz=/[&<>"'`]/g,BK=RegExp(kz.source),FK={"&":"&","<":"<",">":">",'"':""","'":"'","`":"`"},HK=typeof globalThis=="object"&&globalThis&&globalThis.Object===Object&&globalThis,jK=typeof self=="object"&&self&&self.Object===Object&&self,WK=HK||jK||Function("return this")();function $K(t){return function(e){return t?.[e]}}var UK=$K(FK),qK=Object.prototype,VK=qK.toString,Az=WK.Symbol,Tz=Az?Az.prototype:void 0,Rz=Tz?Tz.toString:void 0;function GK(t){if(typeof t=="string")return t;if(KK(t))return Rz?Rz.call(t):"";var e=t+"";return e=="0"&&1/t==-PK?"-0":e}function YK(t){return!!t&&typeof t=="object"}function KK(t){return typeof t=="symbol"||YK(t)&&VK.call(t)==zK}function XK(t){return t==null?"":GK(t)}function JK(t){return t=XK(t),t&&BK.test(t)?t.replace(kz,UK):t}Nz.exports=JK});function Pz(t){let e=[],i=null,n=null,r=null,s=0,o;t.includes("`")||t.includes("~~~")?(t=t.replace(/~/g,"~T").replace(/^(?`{3,}|(~T){3,})[^`\n]*\n([\s\S]*?)^\k`*$/gm,c=>c.replace(/\$/g,"~D")).replace(/(^|[^\\])(`+)([^\n]*?[^`\n])\2(?!`)/gm,c=>c.replace(/\$/g,"~D")),o=c=>c.replace(/~([TD])/g,(u,d)=>d==="T"?"~":Oz)):o=c=>c;let l=t.replace(/\r\n?/g,` -`).split(ZK);for(let c=1,u=l.length;c{let s=e[r];return s.substr(0,3)==="\\\\("&&s.substr(s.length-3)==="\\\\)"?s="\\("+s.substring(3,s.length-3)+"\\)":s.substr(0,3)==="\\\\["&&s.substr(s.length-3)==="\\\\]"&&(s="\\["+s.substring(3,s.length-3)+"\\]"),s};return t.replace(/@@(\d+)@@/g,i)}function hE(t,e,i,n,r){let s=r.slice(t,e+1).join("").replace(/&/g,"&").replace(//g,">");for(navigator&&navigator.appName==="Microsoft Internet Explorer"&&(s=s.replace(/(%[^\n]*)\n/g,`$1
-`));e>t;)r[e]="",e--;return r[t]="@@"+n.length+"@@",i&&(s=i(s)),n.push(s),r}var Oz,ZK,fE=$(()=>{Oz="$",ZK=/(\$\$?|\\(?:begin|end)\{[a-z]*\*?\}|\\[{}$]|[{}]|(?:\n\s*)+|@@\d+@@|\\\\(?:\(|\)|\[|\]))/i});function gE(t){let{host:e,source:i,trusted:n,sanitizer:r,resolver:s,linkHandler:o,shouldTypeset:a,latexTypesetter:l,translator:c}=t;c=c||Eo;let u=c?.load("jupyterlab"),d=i;if(!i)return e.textContent="",Promise.resolve(void 0);if(n||(d=`${i}`,i=r.sanitize(i)),e.innerHTML=i,e.getElementsByTagName("script").length>0)if(n)ya.evalInnerHTMLScriptTags(e);else{let h=document.createElement("div"),m=document.createElement("pre");m.textContent=u.__("This HTML output contains inline scripts. Are you sure that you want to run arbitrary Javascript within your JupyterLab session?");let p=document.createElement("button");p.textContent=u.__("Run"),p.onclick=v=>{e.innerHTML=d,ya.evalInnerHTMLScriptTags(e),e.firstChild&&e.removeChild(e.firstChild)},h.appendChild(m),h.appendChild(p),e.insertBefore(h,e.firstChild)}ya.handleDefaults(e,s);let f;return s?f=ya.handleUrls(e,s,o):f=Promise.resolve(void 0),f.then(()=>{a&&l&&l.typeset(e)})}function jz(t){let{host:e,mimeType:i,source:n,width:r,height:s,needsBackground:o,unconfined:a}=t;e.textContent="";let l=document.createElement("img");return l.src=`data:${i};base64,${n}`,typeof s=="number"&&(l.height=s),typeof r=="number"&&(l.width=r),o==="light"?l.classList.add("jp-needs-light-background"):o==="dark"&&l.classList.add("jp-needs-dark-background"),a===!0&&l.classList.add("jp-mod-unconfined"),e.appendChild(l),Promise.resolve(void 0)}function Wz(t){let{host:e,source:i,shouldTypeset:n,latexTypesetter:r}=t;return e.textContent=i,n&&r&&r.typeset(e),Promise.resolve(void 0)}async function Ig(t){let{host:e,source:i,markdownParser:n,...r}=t;if(!i){e.textContent="";return}let s="";if(n){let o=Pz(i);s=await n.render(o.text),s=zz(s,o.math)}else s=`
${i}
`;await gE({host:e,source:s,...r}),ya.headerAnchors(e)}function $z(t){let{host:e,source:i,trusted:n,unconfined:r}=t;if(!i)return e.textContent="",Promise.resolve(void 0);if(!n)return e.textContent="Cannot display an untrusted SVG. Maybe you need to run the cell?",Promise.resolve(void 0);let s="]+xmlns=[^>]+svg";i.search(s)<0&&(i=i.replace("{if(o>=i.length){n.push(document.createTextNode(s));return}let a=i[o],l,c=0,u=a.regex;for(u.lastIndex=0;(l=u.exec(s))!=null;){let f=s.substring(c,l.index);f&&r(f,o+1);let{path:h,...m}=l.groups,p=a.processPath?a.processPath(h):h,v=a.processLabel?a.processLabel(l[0]):l[0];n.push(a.createAnchor(p,v,m)),c=l.index+v.length}let d=s.substring(c);d&&r(d,o+1)};return r(t,0),n}function Bz(t,e){var i,n;let r=t.cloneNode();r.textContent=(i=t.textContent)===null||i===void 0?void 0:i.slice(0,e);let s=t.cloneNode();return s.textContent=(n=t.textContent)===null||n===void 0?void 0:n.slice(e),{pre:r,post:s}}function*Fz(t){var e;let i=0,n;for(let r of t)n=i+(((e=r.textContent)===null||e===void 0?void 0:e.length)||0),yield{node:r,start:i,end:n,isText:r.nodeType===Node.TEXT_NODE},i=n}function*QK(t,e){var i,n;let r=Fz(t),s=Fz(e),o=r.next(),a=s.next();for(;!o.done&&!a.done;){let l=o.value,c=a.value;if(l.isText&&l.start<=c.start&&l.end>=c.end)yield[null,c.node],a=s.next();else if(c.isText&&c.start<=l.start&&c.end>=l.end)yield[l.node,null],o=r.next();else if(l.end===c.end&&l.start===c.start)yield[l.node,c.node],o=r.next(),a=s.next();else if(l.end>c.end){let{pre:u,post:d}=Bz(l.node,c.end-l.start);c.startl.end){let{pre:u,post:d}=Bz(c.node,l.end-c.start);l.start{Yf=P(_k());Vp();Hz=P(Dz());fE();(function(t){function e(i){var n;return((n=i.textContent)!==null&&n!==void 0?n:"").replace(/ /g,"-")}t.createHeaderId=e})(Ig||(Ig={}));(function(t){let e="\\u0000-\\u0020\\u007f-\\u009f";t.webLinkRegex=new RegExp("(?(?:[a-zA-Z][a-zA-Z0-9+.-]{2,}:\\/\\/|data:|www\\.)[^\\s"+e+'"]{2,}[^\\s'+e+`"'(){}\\[\\],:;.!?])`,"ug");let i=/(?:[a-zA-Z]:(?:(?:\\|\/)[\w\.-]*)+)/,n=/(?:(?:\~|\.)(?:(?:\\|\/)[\w\.-]*)+)/,r=new RegExp(`(${i.source}|${n.source})`),s=/((?:\~|\.)?(?:\/[\w\.-]*)+)/,o=/(?:(?:\:|", line )(?[\d]+))?(?:\:(?[\d]+))?/,a=navigator.userAgent.indexOf("Windows")>=0;t.pathLinkRegex=new RegExp(`(?${a?r.source:s.source})${o.source}`,"g")})(P0||(P0={}));mE=class{constructor(){this.regex=P0.webLinkRegex}createAnchor(e,i){let n=document.createElement("a");return n.href=e.startsWith("www.")?"https://"+e:e,n.rel="noopener",n.target="_blank",n.appendChild(document.createTextNode(i)),n}processPath(e){let i=e.slice(-1),r=[">","<"].indexOf(i)!==-1?e.length-1:e.length;return e=e.slice(0,r),e}processLabel(e){return this.processPath(e)}},pE=class{constructor(){this.regex=P0.pathLinkRegex}createAnchor(e,i,n){let r=document.createElement("a");r.dataset.path=e;let s=parseInt(n.line,10),o=isNaN(s)?"":`line=${s-1}`;return r.dataset.locator=o,r.appendChild(document.createTextNode(i)),r}};(function(t){function e(h){let m=Array.from(h.getElementsByTagName("script"));for(let p of m){if(!p.parentNode)continue;let v=document.createElement("script"),y=p.attributes;for(let C=0,M=y.length;C{})}t.handleUrls=n;async function r(h,m,p){let v=h.getElementsByTagName("a");for(let y=0;y{let O=decodeURIComponent(M);return p&&p.handleLink(h,O,C),m.getDownloadUrl(M)}).then(M=>{h.href=M+C}).catch(M=>{h.href=""})}async function l(h,m,p){let v=h.dataset.path||"",y=h.dataset.locator?"#"+h.dataset.locator:"";delete h.dataset.path,delete h.dataset.locator;let 
[Minified JavaScript bundle omitted: third-party build output (JupyterLab rendermime renderers and Lumino widget classes), not part of the appendix content.]
e=this.currentIndex,i=this.currentTitle;this._currentIndex=-1,this._previousTitle=null,this._titles.length=0,this.update(),e!==-1&&this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}releaseMouse(){this._releaseMouse()}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"dblclick":this._evtDblClick(e);break;case"keydown":e.eventPhase===Event.CAPTURING_PHASE?this._evtKeyDownCapturing(e):this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this),this.node.addEventListener("dblclick",this),this.node.addEventListener("keydown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this.node.removeEventListener("dblclick",this),this.node.removeEventListener("keydown",this),this._releaseMouse()}onUpdateRequest(e){var i;let n=this._titles,r=this.renderer,s=this.currentTitle,o=new Array(n.length),a=(i=this._getCurrentTabindex())!==null&&i!==void 0?i:this._currentIndex>-1?this._currentIndex:0;for(let l=0,c=n.length;lli.hitTest(o,e.clientX,e.clientY));if(n===-1)return;let r=this.titles[n],s=i[n].querySelector(".lm-TabBar-tabLabel");if(s&&s.contains(e.target)){let o=r.label||"",a=s.innerHTML;s.innerHTML="";let l=document.createElement("input");l.classList.add("lm-TabBar-tabInput"),l.value=o,s.appendChild(l);let c=()=>{l.removeEventListener("blur",c),s.innerHTML=a,this.node.addEventListener("keydown",this)};l.addEventListener("dblclick",u=>u.stopPropagation()),l.addEventListener("blur",c),l.addEventListener("keydown",u=>{u.key==="Enter"?(l.value!==""&&(r.label=r.caption=l.value),c()):u.key==="Escape"&&c()}),this.node.removeEventListener("keydown",this),l.select(),l.focus(),s.children.length>0&&s.children[0].focus()}}_evtKeyDownCapturing(e){e.eventPhase===Event.CAPTURING_PHASE&&(e.preventDefault(),e.stopPropagation(),e.key==="Escape"&&this._releaseMouse())}_evtKeyDown(e){var i,n,r;if(!(e.key==="Tab"||e.eventPhase===Event.CAPTURING_PHASE)){if(e.key==="Enter"||e.key==="Spacebar"||e.key===" "){let s=document.activeElement;if(this.addButtonEnabled&&this.addButtonNode.contains(s))e.preventDefault(),e.stopPropagation(),this._addRequested.emit();else{let o=$e.findFirstIndex(this.contentNode.children,a=>a.contains(s));o>=0&&(e.preventDefault(),e.stopPropagation(),this.currentIndex=o)}}else if(nX.includes(e.key)){let s=[...this.contentNode.children];if(this.addButtonEnabled&&s.push(this.addButtonNode),s.length<=1)return;e.preventDefault(),e.stopPropagation();let o=s.indexOf(document.activeElement);o===-1&&(o=this._currentIndex);let a;e.key==="ArrowRight"&&this._orientation==="horizontal"||e.key==="ArrowDown"&&this._orientation==="vertical"?a=(i=s[o+1])!==null&&i!==void 0?i:s[0]:e.key==="ArrowLeft"&&this._orientation==="horizontal"||e.key==="ArrowUp"&&this._orientation==="vertical"?a=(n=s[o-1])!==null&&n!==void 0?n:s[s.length-1]:e.key==="Home"?a=s[0]:e.key==="End"&&(a=s[s.length-1]),a&&((r=s[o])===null||r===void 0||r.setAttribute("tabindex","-1"),a?.setAttribute("tabindex","0"),a.focus())}}}_evtPointerDown(e){if(e.button!==0&&e.button!==1||this._dragData||e.target.classList.contains("lm-TabBar-tabInput"))return;let 
i=this.addButtonEnabled&&this.addButtonNode.contains(e.target),n=this.contentNode.children,r=$e.findFirstIndex(n,o=>li.hitTest(o,e.clientX,e.clientY));if(r===-1&&!i||(e.preventDefault(),e.stopPropagation(),this._dragData={tab:n[r],index:r,pressX:e.clientX,pressY:e.clientY,tabPos:-1,tabSize:-1,tabPressPos:-1,targetIndex:-1,tabLayout:null,contentRect:null,override:null,dragActive:!1,dragAborted:!1,detachRequested:!1},this.document.addEventListener("pointerup",this,!0),e.button===1||i))return;let s=n[r].querySelector(this.renderer.closeIconSelector);s&&s.contains(e.target)||(this.tabsMovable&&(this.document.addEventListener("pointermove",this,!0),this.document.addEventListener("keydown",this,!0),this.document.addEventListener("contextmenu",this,!0)),this.allowDeselect&&this.currentIndex===r?this.currentIndex=-1:this.currentIndex=r,this.currentIndex!==-1&&this._tabActivateRequested.emit({index:this.currentIndex,title:this.currentTitle}))}_evtPointerMove(e){let i=this._dragData;if(!i)return;e.preventDefault(),e.stopPropagation();let n=this.contentNode.children;if(!(!i.dragActive&&!ro.dragExceeded(i,e))){if(!i.dragActive){let r=i.tab.getBoundingClientRect();this._orientation==="horizontal"?(i.tabPos=i.tab.offsetLeft,i.tabSize=r.width,i.tabPressPos=i.pressX-r.left):(i.tabPos=i.tab.offsetTop,i.tabSize=r.height,i.tabPressPos=i.pressY-r.top),i.tabPressOffset={x:i.pressX-r.left,y:i.pressY-r.top},i.tabLayout=ro.snapTabLayout(n,this._orientation),i.contentRect=this.contentNode.getBoundingClientRect(),i.override=mn.overrideCursor("default"),i.tab.classList.add("lm-mod-dragging"),this.addClass("lm-mod-dragging"),i.dragActive=!0}if(!i.detachRequested&&ro.detachExceeded(i,e)){i.detachRequested=!0;let r=i.index,s=e.clientX,o=e.clientY,a=n[r],l=this._titles[r];if(this._tabDetachRequested.emit({index:r,title:l,tab:a,clientX:s,clientY:o,offset:i.tabPressOffset}),i.dragAborted)return}ro.layoutTabs(n,i,e,this._orientation)}}_evtPointerUp(e){if(e.button!==0&&e.button!==1)return;let i=this._dragData;if(!i)return;if(e.preventDefault(),e.stopPropagation(),this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),!i.dragActive){if(this._dragData=null,this.addButtonEnabled&&this.addButtonNode.contains(e.target)){this._addRequested.emit(void 0);return}let s=this.contentNode.children,o=$e.findFirstIndex(s,c=>li.hitTest(c,e.clientX,e.clientY));if(o!==i.index)return;let a=this._titles[o];if(!a.closable)return;if(e.button===1){this._tabCloseRequested.emit({index:o,title:a});return}let l=s[o].querySelector(this.renderer.closeIconSelector);if(l&&l.contains(e.target)){this._tabCloseRequested.emit({index:o,title:a});return}return}if(e.button!==0)return;ro.finalizeTabPosition(i,this._orientation),i.tab.classList.remove("lm-mod-dragging");let n=ro.parseTransitionDuration(i.tab);setTimeout(()=>{if(i.dragAborted)return;this._dragData=null,ro.resetTabPositions(this.contentNode.children,this._orientation),i.override.dispose(),this.removeClass("lm-mod-dragging");let r=i.index,s=i.targetIndex;s===-1||r===s||($e.move(this._titles,r,s),this._adjustCurrentForMove(r,s),this._tabMoved.emit({fromIndex:r,toIndex:s,title:this._titles[s]}),Fe.sendMessage(this,pe.Msg.UpdateRequest))},n)}_releaseMouse(){let 
e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(ro.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Le.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Le.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Le.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Le.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Le.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} 
${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(zc||(zc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof Ag?u:new Ag(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let m,p,v,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,v=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,v=f.clientY,y=d.contentRect.height);let C=d.index,M=p-d.tabPressPos,O=M+d.tabSize;for(let R=0,_=u.length;R<_;++R){let L,S=d.tabLayout[R],x=S.pos+(S.size>>1);if(Rd.index&&O>x)L=`${-d.tabSize-S.margin}px`,C=Math.max(C,R);else if(R===d.index){let w=v-m,E=y-(d.tabPos+d.tabSize);L=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else L="";h==="horizontal"?u[R].style.left=L:u[R].style.top=L}d.targetIndex=C}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let v=u.tabLayout[u.targetIndex];h=v.pos+v.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(ro||(ro={}));kE=class extends Ea{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=Rg.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:pe.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=Rg.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():Jf()}widgets(){return this._root?this._root.iterUserWidgets():Jf()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():Jf()}tabBars(){return this._root?this._root.iterTabBars():Jf()}handles(){return this._root?this._root.iterHandles():Jf()}moveHandle(e,i,n){let r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let 
o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),_s.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=un.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=un.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=li.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new id(e)),this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let 
i=this._root.findTabNode(e);if(!i)return;if(un.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===pe.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=pe.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=$e.removeFirstOf(n.children,i),s=$e.removeAt(n.handles,r);if($e.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof un.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=$e.removeAt(c.handles,u);$e.removeAt(c.children,u),$e.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,v=0,y=1/0,C=1/0,M=m.get(this.tabBar),O=this.tabBar.currentTitle,R=O?m.get(O.owner):void 0,[_,L]=this.sizers;return M&&M.fit(),R&&R.fit(),M&&!M.isHidden?(p=Math.max(p,M.minWidth),v+=M.minHeight,_.minSize=M.minHeight,_.maxSize=M.maxHeight):(_.minSize=0,_.maxSize=0),R&&!R.isHidden?(p=Math.max(p,R.minWidth),v+=R.minHeight,L.minSize=R.minHeight,L.maxSize=1/0):(L.minSize=0,L.maxSize=1/0),{minWidth:p,minHeight:v,maxWidth:y,maxHeight:C}}update(h,m,p,v,y,C){this._top=m,this._left=h,this._width=p,this._height=v;let M=C.get(this.tabBar),O=this.tabBar.currentTitle,R=O?C.get(O.owner):void 0;if(_s.calc(this.sizers,v),M&&!M.isHidden){let _=this.sizers[0].size;M.update(h,m,p,_),m+=_}if(R&&!R.isHidden){let _=this.sizers[1].size;R.update(h,m,p,_)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;mv.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,v)=>p+v.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(v=>v.size),p=m.reduce((v,y)=>v+y,0);if(p===0)for(let v=m.length-1;v>-1;v--)m[v]=1/h;else for(let v=m.length-1;v>-1;v--)m[v]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",v=Math.max(0,this.children.length-1)*h,y=p?v:0,C=p?0:v,M=1/0,O=1/0;for(let 
R=0,_=this.children.length;R<_;++R){let L=this.children[R].fit(h,m);p?(C=Math.max(C,L.minHeight),y+=L.minWidth,this.sizers[R].minSize=L.minWidth):(y=Math.max(y,L.minWidth),C+=L.minHeight,this.sizers[R].minSize=L.minHeight)}return{minWidth:y,minHeight:C,maxWidth:M,maxHeight:O}}update(h,m,p,v,y,C){let M=this.orientation==="horizontal",O=Math.max(0,this.children.length-1)*y,R=Math.max(0,(M?p:v)-O);if(this.normalized){for(let _ of this.sizers)_.sizeHint*=R;this.normalized=!1}_s.calc(this.sizers,R);for(let _=0,L=this.children.length;_=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],v=[];for(let y=0,C=f.children.length;y{let C=n(v,h,m),M=e(f.sizes[y]),O=h.createHandle();p.children.push(C),p.handles.push(O),p.sizers.push(M),C.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(un||(un={}));sd=class extends pe{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||sd.defaultRenderer,this._edges=e.edges||Qi.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new kE({document:this._document,renderer:i,spacing:e.spacing,hiddenMode:e.hiddenMode}),this.overlay=e.overlay||new sd.Overlay,this.node.appendChild(this.overlay.node)}dispose(){this._releaseMouse(),this.overlay.hide(0),this._drag&&this._drag.dispose(),super.dispose()}get hiddenMode(){return this.layout.hiddenMode}set hiddenMode(e){this.layout.hiddenMode=e}get layoutModified(){return this._layoutModified}get addRequested(){return this._addRequested}get renderer(){return this.layout.renderer}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get mode(){return this._mode}set mode(e){if(this._mode===e)return;this._mode=e,this.dataset.mode=e;let i=this.layout;switch(e){case"multiple-document":for(let n of i.tabBars())n.show();break;case"single-document":i.restoreLayout(Qi.createSingleDocumentConfig(this));break;default:throw"unreachable"}Fe.postMessage(this,Qi.LayoutModified)}get tabsMovable(){return this._tabsMovable}set tabsMovable(e){this._tabsMovable=e;for(let i of this.tabBars())i.tabsMovable=e}get tabsConstrained(){return this._tabsConstrained}set tabsConstrained(e){this._tabsConstrained=e}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled=e;for(let i of this.tabBars())i.addButtonEnabled=e}get isEmpty(){return this.layout.isEmpty}*widgets(){yield*this.layout.widgets()}*selectedWidgets(){yield*this.layout.selectedWidgets()}*tabBars(){yield*this.layout.tabBars()}*handles(){yield*this.layout.handles()}selectWidget(e){let i=EE(this.tabBars(),n=>n.titles.indexOf(e.title)!==-1);if(!i)throw new Error("Widget is not contained in the dock panel.");i.currentTitle=e.title}activateWidget(e){this.selectWidget(e),e.activate()}saveLayout(){return 
this.layout.saveLayout()}restoreLayout(e){this._mode="multiple-document",this.layout.restoreLayout(e),(Qu.IS_EDGE||Qu.IS_IE)&&Fe.flush(),Fe.postMessage(this,Qi.LayoutModified)}addWidget(e,i={}){this._mode==="single-document"?this.layout.addWidget(e):this.layout.addWidget(e,i),Fe.postMessage(this,Qi.LayoutModified)}processMessage(e){e.type==="layout-modified"?this._layoutModified.emit(void 0):super.processMessage(e)}handleEvent(e){switch(e.type){case"lm-dragenter":this._evtDragEnter(e);break;case"lm-dragleave":this._evtDragLeave(e);break;case"lm-dragover":this._evtDragOver(e);break;case"lm-drop":this._evtDrop(e);break;case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("lm-dragenter",this),this.node.addEventListener("lm-dragleave",this),this.node.addEventListener("lm-dragover",this),this.node.addEventListener("lm-drop",this),this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("lm-dragenter",this),this.node.removeEventListener("lm-dragleave",this),this.node.removeEventListener("lm-dragover",this),this.node.removeEventListener("lm-drop",this),this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){Qi.isGeneratedTabBarProperty.get(e.child)||e.child.addClass("lm-DockPanel-widget")}onChildRemoved(e){Qi.isGeneratedTabBarProperty.get(e.child)||(e.child.removeClass("lm-DockPanel-widget"),Fe.postMessage(this,Qi.LayoutModified))}_evtDragEnter(e){e.mimeData.hasData("application/vnd.lumino.widget-factory")&&(e.preventDefault(),e.stopPropagation())}_evtDragLeave(e){e.preventDefault(),!(this._tabsConstrained&&e.source!==this)&&(e.stopPropagation(),this.overlay.hide(1))}_evtDragOver(e){e.preventDefault(),this._tabsConstrained&&e.source!==this||this._showOverlay(e.clientX,e.clientY)==="invalid"?e.dropAction="none":(e.stopPropagation(),e.dropAction=e.proposedAction)}_evtDrop(e){if(e.preventDefault(),this.overlay.hide(0),e.proposedAction==="none"){e.dropAction="none";return}let{clientX:i,clientY:n}=e,{zone:r,target:s}=Qi.findDropTarget(this,i,n,this._edges);if(this._tabsConstrained&&e.source!==this||r==="invalid"){e.dropAction="none";return}let a=e.mimeData.getData("application/vnd.lumino.widget-factory");if(typeof a!="function"){e.dropAction="none";return}let l=a();if(!(l instanceof pe)){e.dropAction="none";return}if(l.contains(this)){e.dropAction="none";return}let 
c=s?Qi.getDropRef(s.tabBar):null;switch(r){case"root-all":this.addWidget(l);break;case"root-top":this.addWidget(l,{mode:"split-top"});break;case"root-left":this.addWidget(l,{mode:"split-left"});break;case"root-right":this.addWidget(l,{mode:"split-right"});break;case"root-bottom":this.addWidget(l,{mode:"split-bottom"});break;case"widget-all":this.addWidget(l,{mode:"tab-after",ref:c});break;case"widget-top":this.addWidget(l,{mode:"split-top",ref:c});break;case"widget-left":this.addWidget(l,{mode:"split-left",ref:c});break;case"widget-right":this.addWidget(l,{mode:"split-right",ref:c});break;case"widget-bottom":this.addWidget(l,{mode:"split-bottom",ref:c});break;case"widget-tab":this.addWidget(l,{mode:"tab-after",ref:c});break;default:throw"unreachable"}e.dropAction=e.proposedAction,e.stopPropagation(),this.activateWidget(l)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation(),e.keyCode===27&&(this._releaseMouse(),Fe.postMessage(this,Qi.LayoutModified))}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=e.target,r=EE(i.handles(),u=>u.contains(n));if(!r)return;e.preventDefault(),e.stopPropagation(),this._document.addEventListener("keydown",this,!0),this._document.addEventListener("pointerup",this,!0),this._document.addEventListener("pointermove",this,!0),this._document.addEventListener("contextmenu",this,!0);let s=r.getBoundingClientRect(),o=e.clientX-s.left,a=e.clientY-s.top,l=window.getComputedStyle(r),c=mn.overrideCursor(l.cursor,this._document);this._pressData={handle:r,deltaX:o,deltaY:a,override:c}}_evtPointerMove(e){if(!this._pressData)return;e.preventDefault(),e.stopPropagation();let i=this.node.getBoundingClientRect(),n=e.clientX-i.left-this._pressData.deltaX,r=e.clientY-i.top-this._pressData.deltaY;this.layout.moveHandle(this._pressData.handle,n,r)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse(),Fe.postMessage(this,Qi.LayoutModified))}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._document.removeEventListener("keydown",this,!0),this._document.removeEventListener("pointerup",this,!0),this._document.removeEventListener("pointermove",this,!0),this._document.removeEventListener("contextmenu",this,!0))}_showOverlay(e,i){let{zone:n,target:r}=Qi.findDropTarget(this,e,i,this._edges);if(n==="invalid")return this.overlay.hide(100),n;let s,o,a,l,c=li.boxSizing(this.node),u=this.node.getBoundingClientRect();switch(n){case"root-all":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"root-top":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=u.height*Qi.GOLDEN_RATIO;break;case"root-left":s=c.paddingTop,o=c.paddingLeft,a=u.width*Qi.GOLDEN_RATIO,l=c.paddingBottom;break;case"root-right":s=c.paddingTop,o=u.width*Qi.GOLDEN_RATIO,a=c.paddingRight,l=c.paddingBottom;break;case"root-bottom":s=u.height*Qi.GOLDEN_RATIO,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"widget-all":s=r.top,o=r.left,a=r.right,l=r.bottom;break;case"widget-top":s=r.top,o=r.left,a=r.right,l=r.bottom+r.height/2;break;case"widget-left":s=r.top,o=r.left,a=r.right+r.width/2,l=r.bottom;break;case"widget-right":s=r.top,o=r.left+r.width/2,a=r.right,l=r.bottom;break;case"widget-bottom":s=r.top+r.height/2,o=r.left,a=r.right,l=r.bottom;break;case"widget-tab":{let d=r.tabBar.node.getBoundingClientRect().height;s=r.top,o=r.left,a=r.right,l=r.bottom+r.height-d;break}default:throw"unreachable"}return this.overlay.show({top:s,left:o,right:a,bottom:l}),n}_createTabBar(){let 
e=this._renderer.createTabBar(this._document);return Qi.isGeneratedTabBarProperty.set(e,!0),this._mode==="single-document"&&e.hide(),e.tabsMovable=this._tabsMovable,e.allowDeselect=!1,e.addButtonEnabled=this._addButtonEnabled,e.removeBehavior="select-previous-tab",e.insertBehavior="select-tab-if-needed",e.tabMoved.connect(this._onTabMoved,this),e.currentChanged.connect(this._onCurrentChanged,this),e.tabCloseRequested.connect(this._onTabCloseRequested,this),e.tabDetachRequested.connect(this._onTabDetachRequested,this),e.tabActivateRequested.connect(this._onTabActivateRequested,this),e.addRequested.connect(this._onTabAddRequested,this),e}_createHandle(){return this._renderer.createHandle()}_onTabMoved(){Fe.postMessage(this,Qi.LayoutModified)}_onCurrentChanged(e,i){let{previousTitle:n,currentTitle:r}=i;n&&n.owner.hide(),r&&r.owner.show(),(Qu.IS_EDGE||Qu.IS_IE)&&Fe.flush(),Fe.postMessage(this,Qi.LayoutModified)}_onTabAddRequested(e){this._addRequested.emit(e)}_onTabActivateRequested(e,i){i.title.owner.activate()}_onTabCloseRequested(e,i){i.title.owner.close()}_onTabDetachRequested(e,i){if(this._drag)return;e.releaseMouse();let{title:n,tab:r,clientX:s,clientY:o,offset:a}=i,l=new Ml.MimeData,c=()=>n.owner;l.setData("application/vnd.lumino.widget-factory",c);let u=r.cloneNode(!0);a&&(u.style.top=`-${a.y}px`,u.style.left=`-${a.x}px`),this._drag=new mn({document:this._document,mimeData:l,dragImage:u,proposedAction:"move",supportedActions:"move",source:this}),r.classList.add("lm-mod-hidden");let d=()=>{this._drag=null,r.classList.remove("lm-mod-hidden")};this._drag.start(s,o).then(d)}};(function(t){class e{constructor(){this._timer=-1,this._hidden=!0,this.node=document.createElement("div"),this.node.classList.add("lm-DockPanel-overlay"),this.node.classList.add("lm-mod-hidden"),this.node.style.position="absolute",this.node.style.contain="strict"}show(r){let s=this.node.style;s.top=`${r.top}px`,s.left=`${r.left}px`,s.right=`${r.right}px`,s.bottom=`${r.bottom}px`,clearTimeout(this._timer),this._timer=-1,this._hidden&&(this._hidden=!1,this.node.classList.remove("lm-mod-hidden"))}hide(r){if(!this._hidden){if(r<=0){clearTimeout(this._timer),this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden");return}this._timer===-1&&(this._timer=window.setTimeout(()=>{this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden")},r))}}}t.Overlay=e;class i{createTabBar(r){let s=new zc({document:r});return s.addClass("lm-DockPanel-tabBar"),s}createHandle(){let r=document.createElement("div");return r.className="lm-DockPanel-handle",r}}t.Renderer=i,t.defaultRenderer=new i})(sd||(sd={}));(function(t){t.GOLDEN_RATIO=.618,t.DEFAULT_EDGES={top:12,right:40,bottom:40,left:40},t.LayoutModified=new Oc("layout-modified"),t.isGeneratedTabBarProperty=new bt({name:"isGeneratedTabBar",create:()=>!1});function e(r){if(r.isEmpty)return{main:null};let s=Array.from(r.widgets()),o=r.selectedWidgets().next().value,a=o?s.indexOf(o):-1;return{main:{type:"tab-area",widgets:s,currentIndex:a}}}t.createSingleDocumentConfig=e;function i(r,s,o,a){if(!li.hitTest(r.node,s,o))return{zone:"invalid",target:null};let l=r.layout;if(l.isEmpty)return{zone:"root-all",target:null};if(r.mode==="multiple-document"){let M=r.node.getBoundingClientRect(),O=s-M.left+1,R=o-M.top+1,_=M.right-s,L=M.bottom-o;switch(Math.min(R,_,L,O)){case R:if(Rp&&f>p&&d>v&&h>v)return{zone:"widget-all",target:c};u/=p,d/=v,f/=p,h/=v;let y=Math.min(u,d,f,h),C;switch(y){case u:C="widget-left";break;case d:C="widget-top";break;case f:C="widget-right";break;case 
h:C="widget-bottom";break;default:throw"unreachable"}return{zone:C,target:c}}t.findDropTarget=i;function n(r){return r.titles.length===0?null:r.currentTitle?r.currentTitle.owner:r.titles[r.titles.length-1].owner}t.getDropRef=n})(Qi||(Qi={}));Il=class extends Ea{constructor(e={}){super(e),this._dirty=!1,this._rowSpacing=4,this._columnSpacing=4,this._items=[],this._rowStarts=[],this._columnStarts=[],this._rowSizers=[new Ca],this._columnSizers=[new Ca],this._box=null,e.rowCount!==void 0&&An.reallocSizers(this._rowSizers,e.rowCount),e.columnCount!==void 0&&An.reallocSizers(this._columnSizers,e.columnCount),e.rowSpacing!==void 0&&(this._rowSpacing=An.clampValue(e.rowSpacing)),e.columnSpacing!==void 0&&(this._columnSpacing=An.clampValue(e.columnSpacing))}dispose(){for(let e of this._items){let i=e.widget;e.dispose(),i.dispose()}this._box=null,this._items.length=0,this._rowStarts.length=0,this._rowSizers.length=0,this._columnStarts.length=0,this._columnSizers.length=0,super.dispose()}get rowCount(){return this._rowSizers.length}set rowCount(e){e!==this.rowCount&&(An.reallocSizers(this._rowSizers,e),this.parent&&this.parent.fit())}get columnCount(){return this._columnSizers.length}set columnCount(e){e!==this.columnCount&&(An.reallocSizers(this._columnSizers,e),this.parent&&this.parent.fit())}get rowSpacing(){return this._rowSpacing}set rowSpacing(e){e=An.clampValue(e),this._rowSpacing!==e&&(this._rowSpacing=e,this.parent&&this.parent.fit())}get columnSpacing(){return this._columnSpacing}set columnSpacing(e){e=An.clampValue(e),this._columnSpacing!==e&&(this._columnSpacing=e,this.parent&&this.parent.fit())}rowStretch(e){let i=this._rowSizers[e];return i?i.stretch:-1}setRowStretch(e,i){let n=this._rowSizers[e];n&&(i=An.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}columnStretch(e){let i=this._columnSizers[e];return i?i.stretch:-1}setColumnStretch(e,i){let n=this._columnSizers[e];n&&(i=An.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}*[Symbol.iterator](){for(let e of this._items)yield e.widget}addWidget(e){$e.findFirstIndex(this._items,n=>n.widget===e)===-1&&(this._items.push(new id(e)),this.parent&&this.attachWidget(e))}removeWidget(e){let i=$e.findFirstIndex(this._items,r=>r.widget===e);if(i===-1)return;let n=$e.removeAt(this._items,i);this.parent&&this.detachWidget(e),n.dispose()}init(){super.init();for(let e of this)this.attachWidget(e)}attachWidget(e){this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.AfterAttach),this.parent.fit()}detachWidget(e){this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Fe.sendMessage(e,pe.Msg.AfterDetach),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){for(let l=0,c=this.rowCount;l!l.isHidden);for(let l=0,c=e.length;l({row:0,column:0,rowSpan:1,columnSpan:1}),changed:a});function e(l){let 
c=Math.max(0,Math.floor(l.row||0)),u=Math.max(0,Math.floor(l.column||0)),d=Math.max(1,Math.floor(l.rowSpan||0)),f=Math.max(1,Math.floor(l.columnSpan||0));return{row:c,column:u,rowSpan:d,columnSpan:f}}t.normalizeConfig=e;function i(l){return Math.max(0,Math.floor(l))}t.clampValue=i;function n(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.rowSpan-d.rowSpan}t.rowSpanCmp=n;function r(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.columnSpan-d.columnSpan}t.columnSpanCmp=r;function s(l,c){for(c=Math.max(1,Math.floor(c));l.lengthc&&(l.length=c)}t.reallocSizers=s;function o(l,c,u,d){if(u=d)return;let h=(d-f)/(u-c+1);for(let m=c;m<=u;++m)l[m].minSize+=h}t.distributeMin=o;function a(l){l.parent&&l.parent.layout instanceof Il&&l.parent.fit()}})(An||(An={}));im=class extends pe{constructor(e={}){super({node:Q0.createNode()}),this._activeIndex=-1,this._tabFocusIndex=0,this._menus=[],this._childMenu=null,this._overflowMenu=null,this._menuItemSizes=[],this._overflowIndex=-1,this.addClass("lm-MenuBar"),this.setFlag(pe.Flag.DisallowLayout),this.renderer=e.renderer||im.defaultRenderer,this._forceItemsPosition=e.forceItemsPosition||{forceX:!0,forceY:!0},this._overflowMenuOptions=e.overflowMenuOptions||{isVisible:!0}}dispose(){this._closeChildMenu(),this._menus.length=0,super.dispose()}get childMenu(){return this._childMenu}get overflowIndex(){return this._overflowIndex}get overflowMenu(){return this._overflowMenu}get contentNode(){return this.node.getElementsByClassName("lm-MenuBar-content")[0]}get activeMenu(){return this._menus[this._activeIndex]||null}set activeMenu(e){this.activeIndex=e?this._menus.indexOf(e):-1}get activeIndex(){return this._activeIndex}set activeIndex(e){(e<0||e>=this._menus.length)&&(e=-1),e>-1&&this._menus[e].items.length===0&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this.update())}get menus(){return this._menus}openActiveMenu(){this._activeIndex!==-1&&(this._openChildMenu(),this._childMenu&&(this._childMenu.activeIndex=-1,this._childMenu.activateNextItem()))}addMenu(e,i=!0){this.insertMenu(this._menus.length,e,i)}insertMenu(e,i,n=!0){this._closeChildMenu();let r=this._menus.indexOf(i),s=Math.max(0,Math.min(e,this._menus.length));if(r===-1){$e.insert(this._menus,s,i),i.addClass("lm-MenuBar-menu"),i.aboutToClose.connect(this._onMenuAboutToClose,this),i.menuRequested.connect(this._onMenuMenuRequested,this),i.title.changed.connect(this._onTitleChanged,this),n&&this.update();return}s===this._menus.length&&s--,r!==s&&($e.move(this._menus,r,s),n&&this.update())}removeMenu(e,i=!0){this.removeMenuAt(this._menus.indexOf(e),i)}removeMenuAt(e,i=!0){this._closeChildMenu();let n=$e.removeAt(this._menus,e);n&&(n.aboutToClose.disconnect(this._onMenuAboutToClose,this),n.menuRequested.disconnect(this._onMenuMenuRequested,this),n.title.changed.disconnect(this._onTitleChanged,this),n.removeClass("lm-MenuBar-menu"),i&&this.update())}clearMenus(){if(this._menus.length!==0){this._closeChildMenu();for(let e of 
this._menus)e.aboutToClose.disconnect(this._onMenuAboutToClose,this),e.menuRequested.disconnect(this._onMenuMenuRequested,this),e.title.changed.disconnect(this._onTitleChanged,this),e.removeClass("lm-MenuBar-menu");this._menus.length=0,this.update()}}handleEvent(e){switch(e.type){case"keydown":this._evtKeyDown(e);break;case"mousedown":this._evtMouseDown(e);break;case"mousemove":this._evtMouseMove(e);break;case"focusout":this._evtFocusOut(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("keydown",this),this.node.addEventListener("mousedown",this),this.node.addEventListener("mousemove",this),this.node.addEventListener("focusout",this),this.node.addEventListener("contextmenu",this)}onAfterDetach(e){this.node.removeEventListener("keydown",this),this.node.removeEventListener("mousedown",this),this.node.removeEventListener("mousemove",this),this.node.removeEventListener("focusout",this),this.node.removeEventListener("contextmenu",this),this._closeChildMenu()}onActivateRequest(e){this.isAttached&&this._focusItemAt(0)}onResize(e){this.update(),super.onResize(e)}onUpdateRequest(e){var i;let n=this._menus,r=this.renderer,s=this._activeIndex,o=this._tabFocusIndex>=0&&this._tabFocusIndex-1?this._overflowIndex:n.length,l=0,c=!1;a=this._overflowMenu!==null?a-1:a;let u=new Array(a);for(let d=0;d{this._tabFocusIndex=d,this.activeIndex=d}}),l+=this._menuItemSizes[d],n[d].title.label===this._overflowMenuOptions.title&&(c=!0,a--);if(this._overflowMenuOptions.isVisible){if(this._overflowIndex>-1&&!c){if(this._overflowMenu===null){let d=(i=this._overflowMenuOptions.title)!==null&&i!==void 0?i:"...";this._overflowMenu=new Sa({commands:new ur}),this._overflowMenu.title.label=d,this._overflowMenu.title.mnemonic=0,this.addMenu(this._overflowMenu,!1)}for(let d=n.length-2;d>=a;d--){let f=this.menus[d];f.title.mnemonic=0,this._overflowMenu.insertItem(0,{type:"submenu",submenu:f}),this.removeMenu(f,!1)}u[a]=r.renderItem({title:this._overflowMenu.title,active:a===s&&n[a].items.length!==0,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}else if(this._overflowMenu!==null){let d=this._overflowMenu.items,f=this.node.offsetWidth,h=this._overflowMenu.items.length;for(let m=0;mthis._menuItemSizes[p]){let v=d[0].submenu;this._overflowMenu.removeItemAt(0),this.insertMenu(a,v,!1),u[a]=r.renderItem({title:v.title,active:!1,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}}this._overflowMenu.items.length===0&&(this.removeMenu(this._overflowMenu,!1),u.pop(),this._overflowMenu=null,this._overflowIndex=-1)}}ri.render(u,this.contentNode),this._updateOverflowIndex()}_updateOverflowIndex(){if(!this._overflowMenuOptions.isVisible)return;let e=this.contentNode.childNodes,i=this.node.offsetWidth,n=0,r=-1,s=e.length;if(this._menuItemSizes.length==0)for(let o=0;oi&&r===-1&&(r=o)}else for(let o=0;oi){r=o;break}this._overflowIndex=r}_evtKeyDown(e){let i=e.keyCode;if(i===9){this.activeIndex=-1;return}if(e.preventDefault(),e.stopPropagation(),i===13||i===32||i===38||i===40){if(this.activeIndex=this._tabFocusIndex,this.activeIndex!==this._tabFocusIndex)return;this.openActiveMenu();return}if(i===27){this._closeChildMenu(),this._focusItemAt(this.activeIndex);return}if(i===37||i===39){let o=i===37?-1:1,a=this._tabFocusIndex+o,l=this._menus.length;for(let 
kb(g){return(g.type||"").slice(0,5)==="true/"?g.type=g.type.slice(5):g.removeAttribute("type"),g}function Nb(g,b){var I,A,k,D,z,G,q;if(b.nodeType===1){if(Ee.hasData(g)&&(D=Ee.get(g),q=D.events,q)){Ee.remove(b,"handle events");for(k in q)for(I=0,A=q[k].length;I1&&typeof Ne=="string"&&!h.checkClone&&kp.test(Ne))return g.each(function(zt){var It=g.eq(zt);gt&&(b[0]=Ne.call(this,zt,It.html())),Jo(It,b,I,A)});if(ye&&(k=Ab(b,g[0].ownerDocument,!1,g,A),D=k.firstChild,k.childNodes.length===1&&(k=D),D||A)){for(z=_.map(xn(k,"script"),DC),G=z.length;fe0&&Mp(z,!q&&xn(g,"script")),G},cleanData:function(g){for(var b,I,A,k=_.event.special,D=0;(I=g[D])!==void 0;D++)if(ii(I)){if(b=I[Ee.expando]){if(b.events)for(A in b.events)k[A]?_.event.remove(I,A):_.removeEvent(I,A,b.handle);I[Ee.expando]=void 0}I[U.expando]&&(I[U.expando]=void 0)}}}),_.fn.extend({detach:function(g){return mh(this,g,!0)},remove:function(g){return mh(this,g)},text:function(g){return ce(this,function(b){return b===void 0?_.text(this):this.empty().each(function(){(this.nodeType===1||this.nodeType===11||this.nodeType===9)&&(this.textContent=b)})},null,g,arguments.length)},append:function(){return Jo(this,arguments,function(g){if(this.nodeType===1||this.nodeType===11||this.nodeType===9){var b=Rb(this,g);b.appendChild(g)}})},prepend:function(){return Jo(this,arguments,function(g){if(this.nodeType===1||this.nodeType===11||this.nodeType===9){var b=Rb(this,g);b.insertBefore(g,b.firstChild)}})},before:function(){return Jo(this,arguments,function(g){this.parentNode&&this.parentNode.insertBefore(g,this)})},after:function(){return Jo(this,arguments,function(g){this.parentNode&&this.parentNode.insertBefore(g,this.nextSibling)})},empty:function(){for(var g,b=0;(g=this[b])!=null;b++)g.nodeType===1&&(_.cleanData(xn(g,!1)),g.textContent="");return this},clone:function(g,b){return g=g??!1,b=b??g,this.map(function(){return _.clone(this,g,b)})},html:function(g){return ce(this,function(b){var I=this[0]||{},A=0,k=this.length;if(b===void 0&&I.nodeType===1)return I.innerHTML;if(typeof b=="string"&&!Rp.test(b)&&!Vn[(Ds.exec(b)||["",""])[1].toLowerCase()]){b=_.htmlPrefilter(b);try{for(;A=0&&(q+=Math.max(0,Math.ceil(g["offset"+b[0].toUpperCase()+b.slice(1)]-D-q-G-.5))||0),q+ie}function gu(g,b,I){var A=gh(g),k=!h.boxSizingReliable()||I,D=k&&_.css(g,"boxSizing",!1,A)==="border-box",z=D,G=pu(g,b,A),q="offset"+b[0].toUpperCase()+b.slice(1);if(ph.test(G)){if(!I)return G;G="auto"}return(!h.boxSizingReliable()&&D||!h.reliableTrDimensions()&&S(g,"tr")||G==="auto"||!parseFloat(G)&&_.css(g,"display",!1,A)==="inline")&&g.getClientRects().length&&(D=_.css(g,"boxSizing",!1,A)==="border-box",z=q in g,z&&(G=g[q])),G=parseFloat(G)||0,G+Oi(g,b,I||(D?"border":"content"),z,A,G)+"px"}_.extend({cssHooks:{opacity:{get:function(g,b){if(b){var I=pu(g,"opacity");return I===""?"1":I}}}},cssNumber:{animationIterationCount:!0,aspectRatio:!0,borderImageSlice:!0,columnCount:!0,flexGrow:!0,flexShrink:!0,fontWeight:!0,gridArea:!0,gridColumn:!0,gridColumnEnd:!0,gridColumnStart:!0,gridRow:!0,gridRowEnd:!0,gridRowStart:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,scale:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,floodOpacity:!0,stopOpacity:!0,strokeMiterlimit:!0,strokeOpacity:!0},cssProps:{},style:function(g,b,I,A){if(!(!g||g.nodeType===3||g.nodeType===8||!g.style)){var k,D,z,G=Lt(b),q=Co.test(b),ie=g.style;if(q||(b=Dp(G)),z=_.cssHooks[b]||_.cssHooks[G],I!==void 0){if(D=typeof 
I,D==="string"&&(k=ft.exec(I))&&k[1]&&(I=Ot(g,b,k),D="number"),I==null||I!==I)return;D==="number"&&!q&&(I+=k&&k[3]||(_.cssNumber[G]?"":"px")),!h.clearCloneStyle&&I===""&&b.indexOf("background")===0&&(ie[b]="inherit"),(!z||!("set"in z)||(I=z.set(g,I,A))!==void 0)&&(q?ie.setProperty(b,I):ie[b]=I)}else return z&&"get"in z&&(k=z.get(g,!1,A))!==void 0?k:ie[b]}},css:function(g,b,I,A){var k,D,z,G=Lt(b),q=Co.test(b);return q||(b=Dp(G)),z=_.cssHooks[b]||_.cssHooks[G],z&&"get"in z&&(k=z.get(g,!0,I)),k===void 0&&(k=pu(g,b,A)),k==="normal"&&b in Pb&&(k=Pb[b]),I===""||I?(D=parseFloat(k),I===!0||isFinite(D)?D||0:k):k}}),_.each(["height","width"],function(g,b){_.cssHooks[b]={get:function(I,A,k){if(A)return BC.test(_.css(I,"display"))&&(!I.getClientRects().length||!I.getBoundingClientRect().width)?Np(I,Op,function(){return gu(I,b,k)}):gu(I,b,k)},set:function(I,A,k){var D,z=gh(I),G=!h.scrollboxSize()&&z.position==="absolute",q=G||k,ie=q&&_.css(I,"boxSizing",!1,z)==="border-box",fe=k?Oi(I,b,k,ie,z):0;return ie&&G&&(fe-=Math.ceil(I["offset"+b[0].toUpperCase()+b.slice(1)]-parseFloat(z[b])-Oi(I,b,"border",!1,z)-.5)),fe&&(D=ft.exec(A))&&(D[3]||"px")!=="px"&&(I.style[b]=A,A=_.css(I,b)),nc(I,A,fe)}}}),_.cssHooks.marginLeft=Db(h.reliableMarginLeft,function(g,b){if(b)return(parseFloat(pu(g,"marginLeft"))||g.getBoundingClientRect().left-Np(g,{marginLeft:0},function(){return g.getBoundingClientRect().left}))+"px"}),_.each({margin:"",padding:"",border:"Width"},function(g,b){_.cssHooks[g+b]={expand:function(I){for(var A=0,k={},D=typeof I=="string"?I.split(" "):[I];A<4;A++)k[g+Nt[A]+b]=D[A]||D[A-2]||D[0];return k}},g!=="margin"&&(_.cssHooks[g+b].set=nc)}),_.fn.extend({css:function(g,b){return ce(this,function(I,A,k){var D,z,G={},q=0;if(Array.isArray(A)){for(D=gh(I),z=A.length;q1)}});function Nn(g,b,I,A,k){return new Nn.prototype.init(g,b,I,A,k)}_.Tween=Nn,Nn.prototype={constructor:Nn,init:function(g,b,I,A,k,D){this.elem=g,this.prop=I,this.easing=k||_.easing._default,this.options=b,this.start=this.now=this.cur(),this.end=A,this.unit=D||(_.cssNumber[I]?"":"px")},cur:function(){var g=Nn.propHooks[this.prop];return g&&g.get?g.get(this):Nn.propHooks._default.get(this)},run:function(g){var b,I=Nn.propHooks[this.prop];return this.options.duration?this.pos=b=_.easing[this.easing](g,this.options.duration*g,0,1,this.options.duration):this.pos=b=g,this.now=(this.end-this.start)*b+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),I&&I.set?I.set(this):Nn.propHooks._default.set(this),this}},Nn.prototype.init.prototype=Nn.prototype,Nn.propHooks={_default:{get:function(g){var b;return g.elem.nodeType!==1||g.elem[g.prop]!=null&&g.elem.style[g.prop]==null?g.elem[g.prop]:(b=_.css(g.elem,g.prop,""),!b||b==="auto"?0:b)},set:function(g){_.fx.step[g.prop]?_.fx.step[g.prop](g):g.elem.nodeType===1&&(_.cssHooks[g.prop]||g.elem.style[Dp(g.prop)]!=null)?_.style(g.elem,g.prop,g.now+g.unit):g.elem[g.prop]=g.now}}},Nn.propHooks.scrollTop=Nn.propHooks.scrollLeft={set:function(g){g.elem.nodeType&&g.elem.parentNode&&(g.elem[g.prop]=g.now)}},_.easing={linear:function(g){return g},swing:function(g){return .5-Math.cos(g*Math.PI)/2},_default:"swing"},_.fx=Nn.prototype.init,_.fx.step={};var qa,vu,FC=/^(?:toggle|show|hide)$/,Pp=/queueHooks$/;function Zo(){vu&&(v.hidden===!1&&t.requestAnimationFrame?t.requestAnimationFrame(Zo):t.setTimeout(Zo,_.fx.interval),_.fx.tick())}function zp(){return t.setTimeout(function(){qa=void 0}),qa=Date.now()}function bu(g,b){var 
I,A=0,k={height:g};for(b=b?1:0;A<4;A+=2-b)I=Nt[A],k["margin"+I]=k["padding"+I]=g;return b&&(k.opacity=k.width=g),k}function _u(g,b,I){for(var A,k=(jr.tweeners[b]||[]).concat(jr.tweeners["*"]),D=0,z=k.length;D1)},removeAttr:function(g){return this.each(function(){_.removeAttr(this,g)})}}),_.extend({attr:function(g,b,I){var A,k,D=g.nodeType;if(!(D===3||D===8||D===2)){if(typeof g.getAttribute>"u")return _.prop(g,b,I);if((D!==1||!_.isXMLDoc(g))&&(k=_.attrHooks[b.toLowerCase()]||(_.expr.match.bool.test(b)?zb:void 0)),I!==void 0){if(I===null){_.removeAttr(g,b);return}return k&&"set"in k&&(A=k.set(g,I,b))!==void 0?A:(g.setAttribute(b,I+""),I)}return k&&"get"in k&&(A=k.get(g,b))!==null?A:(A=_.find.attr(g,b),A??void 0)}},attrHooks:{type:{set:function(g,b){if(!h.radioValue&&b==="radio"&&S(g,"input")){var I=g.value;return g.setAttribute("type",b),I&&(g.value=I),b}}}},removeAttr:function(g,b){var I,A=0,k=b&&b.match(mi);if(k&&g.nodeType===1)for(;I=k[A++];)g.removeAttribute(I)}}),zb={set:function(g,b,I){return b===!1?_.removeAttr(g,I):g.setAttribute(I,I),I}},_.each(_.expr.match.bool.source.match(/\w+/g),function(g,b){var I=rc[b]||_.find.attr;rc[b]=function(A,k,D){var z,G,q=k.toLowerCase();return D||(G=rc[q],rc[q]=z,z=I(A,k,D)!=null?q:null,rc[q]=G),z}});var WC=/^(?:input|select|textarea|button)$/i,$C=/^(?:a|area)$/i;_.fn.extend({prop:function(g,b){return ce(this,_.prop,g,b,arguments.length>1)},removeProp:function(g){return this.each(function(){delete this[_.propFix[g]||g]})}}),_.extend({prop:function(g,b,I){var A,k,D=g.nodeType;if(!(D===3||D===8||D===2))return(D!==1||!_.isXMLDoc(g))&&(b=_.propFix[b]||b,k=_.propHooks[b]),I!==void 0?k&&"set"in k&&(A=k.set(g,I,b))!==void 0?A:g[b]=I:k&&"get"in k&&(A=k.get(g,b))!==null?A:g[b]},propHooks:{tabIndex:{get:function(g){var b=_.find.attr(g,"tabindex");return b?parseInt(b,10):WC.test(g.nodeName)||$C.test(g.nodeName)&&g.href?0:-1}}},propFix:{for:"htmlFor",class:"className"}}),h.optSelected||(_.propHooks.selected={get:function(g){var b=g.parentNode;return b&&b.parentNode&&b.parentNode.selectedIndex,null},set:function(g){var b=g.parentNode;b&&(b.selectedIndex,b.parentNode&&b.parentNode.selectedIndex)}}),_.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){_.propFix[this.toLowerCase()]=this});function Va(g){var b=g.match(mi)||[];return b.join(" ")}function Os(g){return g.getAttribute&&g.getAttribute("class")||""}function Y(g){return Array.isArray(g)?g:typeof g=="string"?g.match(mi)||[]:[]}_.fn.extend({addClass:function(g){var b,I,A,k,D,z;return m(g)?this.each(function(G){_(this).addClass(g.call(this,G,Os(this)))}):(b=Y(g),b.length?this.each(function(){if(A=Os(this),I=this.nodeType===1&&" "+Va(A)+" ",I){for(D=0;D-1;)I=I.replace(" "+k+" "," ");z=Va(I),A!==z&&this.setAttribute("class",z)}}):this):this.attr("class","")},toggleClass:function(g,b){var I,A,k,D,z=typeof g,G=z==="string"||Array.isArray(g);return m(g)?this.each(function(q){_(this).toggleClass(g.call(this,q,Os(this),b),b)}):typeof b=="boolean"&&G?b?this.addClass(g):this.removeClass(g):(I=Y(g),this.each(function(){if(G)for(D=_(this),k=0;k-1)return!0;return!1}});var re=/\r/g;_.fn.extend({val:function(g){var b,I,A,k=this[0];return arguments.length?(A=m(g),this.each(function(D){var z;this.nodeType===1&&(A?z=g.call(this,D,_(this).val()):z=g,z==null?z="":typeof z=="number"?z+="":Array.isArray(z)&&(z=_.map(z,function(G){return G==null?"":G+""})),b=_.valHooks[this.type]||_.valHooks[this.nodeName.toLowerCase()],(!b||!("set"in 
b)||b.set(this,z,"value")===void 0)&&(this.value=z))})):k?(b=_.valHooks[k.type]||_.valHooks[k.nodeName.toLowerCase()],b&&"get"in b&&(I=b.get(k,"value"))!==void 0?I:(I=k.value,typeof I=="string"?I.replace(re,""):I??"")):void 0}}),_.extend({valHooks:{option:{get:function(g){var b=_.find.attr(g,"value");return b??Va(_.text(g))}},select:{get:function(g){var b,I,A,k=g.options,D=g.selectedIndex,z=g.type==="select-one",G=z?null:[],q=z?D+1:k.length;for(D<0?A=q:A=z?D:0;A-1)&&(I=!0);return I||(g.selectedIndex=-1),D}}}}),_.each(["radio","checkbox"],function(){_.valHooks[this]={set:function(g,b){if(Array.isArray(b))return g.checked=_.inArray(_(g).val(),b)>-1}},h.checkOn||(_.valHooks[this].get=function(g){return g.getAttribute("value")===null?"on":g.value})});var ae=t.location,Ie={guid:Date.now()},Ve=/\?/;_.parseXML=function(g){var b,I;if(!g||typeof g!="string")return null;try{b=new t.DOMParser().parseFromString(g,"text/xml")}catch{}return I=b&&b.getElementsByTagName("parsererror")[0],(!b||I)&&_.error("Invalid XML: "+(I?_.map(I.childNodes,function(A){return A.textContent}).join(` -`):g)),b};var Ge=/^(?:focusinfocus|focusoutblur)$/,it=function(g){g.stopPropagation()};_.extend(_.event,{trigger:function(g,b,I,A){var k,D,z,G,q,ie,fe,ye,ue=[I||v],Ne=u.call(g,"type")?g.type:g,gt=u.call(g,"namespace")?g.namespace.split("."):[];if(D=ye=z=I=I||v,!(I.nodeType===3||I.nodeType===8)&&!Ge.test(Ne+_.event.triggered)&&(Ne.indexOf(".")>-1&&(gt=Ne.split("."),Ne=gt.shift(),gt.sort()),q=Ne.indexOf(":")<0&&"on"+Ne,g=g[_.expando]?g:new _.Event(Ne,typeof g=="object"&&g),g.isTrigger=A?2:3,g.namespace=gt.join("."),g.rnamespace=g.namespace?new RegExp("(^|\\.)"+gt.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,g.result=void 0,g.target||(g.target=I),b=b==null?[g]:_.makeArray(b,[g]),fe=_.event.special[Ne]||{},!(!A&&fe.trigger&&fe.trigger.apply(I,b)===!1))){if(!A&&!fe.noBubble&&!p(I)){for(G=fe.delegateType||Ne,Ge.test(G+Ne)||(D=D.parentNode);D;D=D.parentNode)ue.push(D),z=D;z===(I.ownerDocument||v)&&ue.push(z.defaultView||z.parentWindow||t)}for(k=0;(D=ue[k++])&&!g.isPropagationStopped();)ye=D,g.type=k>1?G:fe.bindType||Ne,ie=(Ee.get(D,"events")||Object.create(null))[g.type]&&Ee.get(D,"handle"),ie&&ie.apply(D,b),ie=q&&D[q],ie&&ie.apply&&ii(D)&&(g.result=ie.apply(D,b),g.result===!1&&g.preventDefault());return g.type=Ne,!A&&!g.isDefaultPrevented()&&(!fe._default||fe._default.apply(ue.pop(),b)===!1)&&ii(I)&&q&&m(I[Ne])&&!p(I)&&(z=I[q],z&&(I[q]=null),_.event.triggered=Ne,g.isPropagationStopped()&&ye.addEventListener(Ne,it),I[Ne](),g.isPropagationStopped()&&ye.removeEventListener(Ne,it),_.event.triggered=void 0,z&&(I[q]=z)),g.result}},simulate:function(g,b,I){var A=_.extend(new _.Event,I,{type:g,isSimulated:!0});_.event.trigger(A,null,b)}}),_.fn.extend({trigger:function(g,b){return this.each(function(){_.event.trigger(g,b,this)})},triggerHandler:function(g,b){var I=this[0];if(I)return _.event.trigger(g,b,I,!0)}});var lt=/\[\]$/,ni=/\r?\n/g,Jt=/^(?:submit|button|image|reset|file)$/i,Ut=/^(?:input|select|textarea|keygen)/i;function Ui(g,b,I,A){var k;if(Array.isArray(b))_.each(b,function(D,z){I||lt.test(g)?A(g,z):Ui(g+"["+(typeof z=="object"&&z!=null?D:"")+"]",z,I,A)});else if(!I&&M(b)==="object")for(k in b)Ui(g+"["+k+"]",b[k],I,A);else A(g,b)}_.param=function(g,b){var I,A=[],k=function(D,z){var G=m(z)?z():z;A[A.length]=encodeURIComponent(D)+"="+encodeURIComponent(G??"")};if(g==null)return"";if(Array.isArray(g)||g.jquery&&!_.isPlainObject(g))_.each(g,function(){k(this.name,this.value)});else for(I in g)Ui(I,g[I],b,k);return 
A.join("&")},_.fn.extend({serialize:function(){return _.param(this.serializeArray())},serializeArray:function(){return this.map(function(){var g=_.prop(this,"elements");return g?_.makeArray(g):this}).filter(function(){var g=this.type;return this.name&&!_(this).is(":disabled")&&Ut.test(this.nodeName)&&!Jt.test(g)&&(this.checked||!$a.test(g))}).map(function(g,b){var I=_(this).val();return I==null?null:Array.isArray(I)?_.map(I,function(A){return{name:b.name,value:A.replace(ni,`\r -`)}}):{name:b.name,value:I.replace(ni,`\r -`)}}).get()}});var Kt=/%20/g,Dn=/#.*$/,On=/([?&])_=[^&]*/,fn=/^(.*?):[ \t]*([^\r\n]*)$/mg,yn=/^(?:about|app|app-storage|.+-extension|file|res|widget):$/,Bp=/^(?:GET|HEAD)$/,Fp=/^\/\//,_h={},xh={},yh="*/".concat("*"),wh=v.createElement("a");wh.href=ae.href;function Hp(g){return function(b,I){typeof b!="string"&&(I=b,b="*");var A,k=0,D=b.toLowerCase().match(mi)||[];if(m(I))for(;A=D[k++];)A[0]==="+"?(A=A.slice(1)||"*",(g[A]=g[A]||[]).unshift(I)):(g[A]=g[A]||[]).push(I)}}function VR(g,b,I,A){var k={},D=g===xh;function z(G){var q;return k[G]=!0,_.each(g[G]||[],function(ie,fe){var ye=fe(b,I,A);if(typeof ye=="string"&&!D&&!k[ye])return b.dataTypes.unshift(ye),z(ye),!1;if(D)return!(q=ye)}),q}return z(b.dataTypes[0])||!k["*"]&&z("*")}function UC(g,b){var I,A,k=_.ajaxSettings.flatOptions||{};for(I in b)b[I]!==void 0&&((k[I]?g:A||(A={}))[I]=b[I]);return A&&_.extend(!0,g,A),g}function Aq(g,b,I){for(var A,k,D,z,G=g.contents,q=g.dataTypes;q[0]==="*";)q.shift(),A===void 0&&(A=g.mimeType||b.getResponseHeader("Content-Type"));if(A){for(k in G)if(G[k]&&G[k].test(A)){q.unshift(k);break}}if(q[0]in I)D=q[0];else{for(k in I){if(!q[0]||g.converters[k+" "+q[0]]){D=k;break}z||(z=k)}D=D||z}if(D)return D!==q[0]&&q.unshift(D),I[D]}function Tq(g,b,I,A){var k,D,z,G,q,ie={},fe=g.dataTypes.slice();if(fe[1])for(z in g.converters)ie[z.toLowerCase()]=g.converters[z];for(D=fe.shift();D;)if(g.responseFields[D]&&(I[g.responseFields[D]]=b),!q&&A&&g.dataFilter&&(b=g.dataFilter(b,g.dataType)),q=D,D=fe.shift(),D){if(D==="*")D=q;else if(q!=="*"&&q!==D){if(z=ie[q+" "+D]||ie["* "+D],!z){for(k in ie)if(G=k.split(" "),G[1]===D&&(z=ie[q+" "+G[0]]||ie["* "+G[0]],z)){z===!0?z=ie[k]:ie[k]!==!0&&(D=G[0],fe.unshift(G[1]));break}}if(z!==!0)if(z&&g.throws)b=z(b);else try{b=z(b)}catch(ye){return{state:"parsererror",error:z?ye:"No conversion from "+q+" to "+D}}}}return{state:"success",data:b}}_.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:ae.href,type:"GET",isLocal:yn.test(ae.protocol),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":yh,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/\bxml\b/,html:/\bhtml/,json:/\bjson\b/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":JSON.parse,"text xml":_.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(g,b){return b?UC(UC(g,_.ajaxSettings),b):UC(_.ajaxSettings,g)},ajaxPrefilter:Hp(_h),ajaxTransport:Hp(xh),ajax:function(g,b){typeof g=="object"&&(b=g,g=void 0),b=b||{};var I,A,k,D,z,G,q,ie,fe,ye,ue=_.ajaxSetup({},b),Ne=ue.context||ue,gt=ue.context&&(Ne.nodeType||Ne.jquery)?_(Ne):_.event,zt=_.Deferred(),It=_.Callbacks("once memory"),Gn=ue.statusCode||{},wn={},Qo={},ea="canceled",Dt={readyState:0,getResponseHeader:function(jt){var qi;if(q){if(!D)for(D={};qi=fn.exec(k);)D[qi[1].toLowerCase()+" "]=(D[qi[1].toLowerCase()+" 
"]||[]).concat(qi[2]);qi=D[jt.toLowerCase()+" "]}return qi==null?null:qi.join(", ")},getAllResponseHeaders:function(){return q?k:null},setRequestHeader:function(jt,qi){return q==null&&(jt=Qo[jt.toLowerCase()]=Qo[jt.toLowerCase()]||jt,wn[jt]=qi),this},overrideMimeType:function(jt){return q==null&&(ue.mimeType=jt),this},statusCode:function(jt){var qi;if(jt)if(q)Dt.always(jt[Dt.status]);else for(qi in jt)Gn[qi]=[Gn[qi],jt[qi]];return this},abort:function(jt){var qi=jt||ea;return I&&I.abort(qi),xu(0,qi),this}};if(zt.promise(Dt),ue.url=((g||ue.url||ae.href)+"").replace(Fp,ae.protocol+"//"),ue.type=b.method||b.type||ue.method||ue.type,ue.dataTypes=(ue.dataType||"*").toLowerCase().match(mi)||[""],ue.crossDomain==null){G=v.createElement("a");try{G.href=ue.url,G.href=G.href,ue.crossDomain=wh.protocol+"//"+wh.host!=G.protocol+"//"+G.host}catch{ue.crossDomain=!0}}if(ue.data&&ue.processData&&typeof ue.data!="string"&&(ue.data=_.param(ue.data,ue.traditional)),VR(_h,ue,b,Dt),q)return Dt;ie=_.event&&ue.global,ie&&_.active++===0&&_.event.trigger("ajaxStart"),ue.type=ue.type.toUpperCase(),ue.hasContent=!Bp.test(ue.type),A=ue.url.replace(Dn,""),ue.hasContent?ue.data&&ue.processData&&(ue.contentType||"").indexOf("application/x-www-form-urlencoded")===0&&(ue.data=ue.data.replace(Kt,"+")):(ye=ue.url.slice(A.length),ue.data&&(ue.processData||typeof ue.data=="string")&&(A+=(Ve.test(A)?"&":"?")+ue.data,delete ue.data),ue.cache===!1&&(A=A.replace(On,"$1"),ye=(Ve.test(A)?"&":"?")+"_="+Ie.guid+++ye),ue.url=A+ye),ue.ifModified&&(_.lastModified[A]&&Dt.setRequestHeader("If-Modified-Since",_.lastModified[A]),_.etag[A]&&Dt.setRequestHeader("If-None-Match",_.etag[A])),(ue.data&&ue.hasContent&&ue.contentType!==!1||b.contentType)&&Dt.setRequestHeader("Content-Type",ue.contentType),Dt.setRequestHeader("Accept",ue.dataTypes[0]&&ue.accepts[ue.dataTypes[0]]?ue.accepts[ue.dataTypes[0]]+(ue.dataTypes[0]!=="*"?", "+yh+"; q=0.01":""):ue.accepts["*"]);for(fe in ue.headers)Dt.setRequestHeader(fe,ue.headers[fe]);if(ue.beforeSend&&(ue.beforeSend.call(Ne,Dt,ue)===!1||q))return Dt.abort();if(ea="abort",It.add(ue.complete),Dt.done(ue.success),Dt.fail(ue.error),I=VR(xh,ue,b,Dt),!I)xu(-1,"No Transport");else{if(Dt.readyState=1,ie&>.trigger("ajaxSend",[Dt,ue]),q)return Dt;ue.async&&ue.timeout>0&&(z=t.setTimeout(function(){Dt.abort("timeout")},ue.timeout));try{q=!1,I.send(wn,xu)}catch(jt){if(q)throw jt;xu(-1,jt)}}function xu(jt,qi,Wp,VC){var ta,$p,ia,sc,oc,Ps=qi;q||(q=!0,z&&t.clearTimeout(z),I=void 0,k=VC||"",Dt.readyState=jt>0?4:0,ta=jt>=200&&jt<300||jt===304,Wp&&(sc=Aq(ue,Dt,Wp)),!ta&&_.inArray("script",ue.dataTypes)>-1&&_.inArray("json",ue.dataTypes)<0&&(ue.converters["text script"]=function(){}),sc=Tq(ue,sc,Dt,ta),ta?(ue.ifModified&&(oc=Dt.getResponseHeader("Last-Modified"),oc&&(_.lastModified[A]=oc),oc=Dt.getResponseHeader("etag"),oc&&(_.etag[A]=oc)),jt===204||ue.type==="HEAD"?Ps="nocontent":jt===304?Ps="notmodified":(Ps=sc.state,$p=sc.data,ia=sc.error,ta=!ia)):(ia=Ps,(jt||!Ps)&&(Ps="error",jt<0&&(jt=0))),Dt.status=jt,Dt.statusText=(qi||Ps)+"",ta?zt.resolveWith(Ne,[$p,Ps,Dt]):zt.rejectWith(Ne,[Dt,Ps,ia]),Dt.statusCode(Gn),Gn=void 0,ie&>.trigger(ta?"ajaxSuccess":"ajaxError",[Dt,ue,ta?$p:ia]),It.fireWith(Ne,[Dt,Ps]),ie&&(gt.trigger("ajaxComplete",[Dt,ue]),--_.active||_.event.trigger("ajaxStop")))}return Dt},getJSON:function(g,b,I){return _.get(g,b,I,"json")},getScript:function(g,b){return _.get(g,void 0,b,"script")}}),_.each(["get","post"],function(g,b){_[b]=function(I,A,k,D){return m(A)&&(D=D||k,k=A,A=void 
0),_.ajax(_.extend({url:I,type:b,dataType:D,data:A,success:k},_.isPlainObject(I)&&I))}}),_.ajaxPrefilter(function(g){var b;for(b in g.headers)b.toLowerCase()==="content-type"&&(g.contentType=g.headers[b]||"")}),_._evalUrl=function(g,b,I){return _.ajax({url:g,type:"GET",dataType:"script",cache:!0,async:!1,global:!1,converters:{"text script":function(){}},dataFilter:function(A){_.globalEval(A,b,I)}})},_.fn.extend({wrapAll:function(g){var b;return this[0]&&(m(g)&&(g=g.call(this[0])),b=_(g,this[0].ownerDocument).eq(0).clone(!0),this[0].parentNode&&b.insertBefore(this[0]),b.map(function(){for(var I=this;I.firstElementChild;)I=I.firstElementChild;return I}).append(this)),this},wrapInner:function(g){return m(g)?this.each(function(b){_(this).wrapInner(g.call(this,b))}):this.each(function(){var b=_(this),I=b.contents();I.length?I.wrapAll(g):b.append(g)})},wrap:function(g){var b=m(g);return this.each(function(I){_(this).wrapAll(b?g.call(this,I):g)})},unwrap:function(g){return this.parent(g).not("body").each(function(){_(this).replaceWith(this.childNodes)}),this}}),_.expr.pseudos.hidden=function(g){return!_.expr.pseudos.visible(g)},_.expr.pseudos.visible=function(g){return!!(g.offsetWidth||g.offsetHeight||g.getClientRects().length)},_.ajaxSettings.xhr=function(){try{return new t.XMLHttpRequest}catch{}};var Rq={0:200,1223:204},jp=_.ajaxSettings.xhr();h.cors=!!jp&&"withCredentials"in jp,h.ajax=jp=!!jp,_.ajaxTransport(function(g){var b,I;if(h.cors||jp&&!g.crossDomain)return{send:function(A,k){var D,z=g.xhr();if(z.open(g.type,g.url,g.async,g.username,g.password),g.xhrFields)for(D in g.xhrFields)z[D]=g.xhrFields[D];g.mimeType&&z.overrideMimeType&&z.overrideMimeType(g.mimeType),!g.crossDomain&&!A["X-Requested-With"]&&(A["X-Requested-With"]="XMLHttpRequest");for(D in A)z.setRequestHeader(D,A[D]);b=function(G){return function(){b&&(b=I=z.onload=z.onerror=z.onabort=z.ontimeout=z.onreadystatechange=null,G==="abort"?z.abort():G==="error"?typeof z.status!="number"?k(0,"error"):k(z.status,z.statusText):k(Rq[z.status]||z.status,z.statusText,(z.responseType||"text")!=="text"||typeof z.responseText!="string"?{binary:z.response}:{text:z.responseText},z.getAllResponseHeaders()))}},z.onload=b(),I=z.onerror=z.ontimeout=b("error"),z.onabort!==void 0?z.onabort=I:z.onreadystatechange=function(){z.readyState===4&&t.setTimeout(function(){b&&I()})},b=b("abort");try{z.send(g.hasContent&&g.data||null)}catch(G){if(b)throw G}},abort:function(){b&&b()}}}),_.ajaxPrefilter(function(g){g.crossDomain&&(g.contents.script=!1)}),_.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/\b(?:java|ecma)script\b/},converters:{"text script":function(g){return _.globalEval(g),g}}}),_.ajaxPrefilter("script",function(g){g.cache===void 0&&(g.cache=!1),g.crossDomain&&(g.type="GET")}),_.ajaxTransport("script",function(g){if(g.crossDomain||g.scriptAttrs){var b,I;return{send:function(A,k){b=_("

2 Linear Quadratic Regulators

2.1 Introduction

Up to this point, we have considered decision problems with finitely many states and actions. However, in many applications, states and actions may take on continuous values. For example, consider autonomous driving, controlling a robot’s joints, and automated manufacturing. How can we teach computers to solve these kinds of problems? This is the task of continuous control.


Figure 2.1: Solving a Rubik’s Cube with a robot hand.


Figure 2.2: Boston Dynamics’s Spot robot.


Aside from the change in the state and action spaces, the general problem setup remains the same: we seek to construct an optimal policy that outputs actions to solve the desired task. We will see that many key ideas and algorithms, in particular dynamic programming algorithms, carry over to this new setting.

This chapter introduces a fundamental tool to solve a simple class of continuous control problems: the linear quadratic regulator. We will then extend this basic method to more complex settings.

Example 2.1 (CartPole): Try to balance a pencil on its point on a flat surface. It’s much more difficult than it may first seem: the position of the pencil varies continuously, and the state transitions governing the system, i.e. the laws of physics, are highly complex. This task is equivalent to the classic control problem known as CartPole:

The state $x \in \mathbb{R}^4$ can be described by:

  1. the position of the cart;

  2. the velocity of the cart;

  3. the angle of the pole;

  4. the angular velocity of the pole.

We can control the cart by applying a horizontal force $u \in \mathbb{R}$.

Goal: Stabilize the cart around an ideal state and action $(x^\star, u^\star)$.


2.2 Optimal control

Recall that an MDP is defined by its state space $\mathcal{S}$, action space $\mathcal{A}$, state transitions $P$, reward function $r$, and discount factor $\gamma$ or time horizon $H$. These have equivalents in the control setting:

- The state and action spaces are continuous rather than finite. That is, $\mathcal{S} \subseteq \mathbb{R}^{n_x}$ and $\mathcal{A} \subseteq \mathbb{R}^{n_u}$, where $n_x$ and $n_u$ are the corresponding dimensions of these spaces, i.e. the number of coordinates needed to specify a single state or action respectively.

- We call the state transitions the dynamics of the system. In the most general case, these might change across timesteps and also include some stochastic noise $w_h$ at each timestep. We denote these dynamics as the function $f_h$ such that $x_{h+1} = f_h(x_h, u_h, w_h)$. Of course, we can simplify to cases where the dynamics are deterministic/noise-free (no $w_h$ term) and/or time-homogeneous (the same function $f$ across timesteps).

- Instead of maximizing the reward function, we seek to minimize the cost function $c_h : \mathcal{S} \times \mathcal{A} \to \mathbb{R}$. Often, the cost function describes how far away we are from a target state-action pair $(x^\star, u^\star)$. An important special case is when the cost is time-homogeneous; that is, it remains the same function $c$ at each timestep $h$.

- We seek to minimize the undiscounted cost within a finite time horizon $H$, i.e. the total cost over $H$ timesteps.

In this chapter, we will only consider deterministic, time-dependent policies $\pi = (\pi_0, \dots, \pi_{H-1})$ where $\pi_h : \mathcal{S} \to \mathcal{A}$ for each $h \in [H]$.
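To make this formulation concrete, here is a minimal sketch (not from the original notes) of evaluating one such policy by simulating the dynamics and accumulating cost; the names `f`, `c`, and the Gaussian noise model are placeholder assumptions.

```python
import numpy as np

def rollout_cost(f, c, policy, x0, H, rng):
    """Roll out x_{h+1} = f(x_h, u_h, w_h) under a deterministic,
    time-dependent policy and return the total (undiscounted) cost."""
    x, total = np.asarray(x0, dtype=float), 0.0
    for h in range(H):
        u = policy[h](x)                  # u_h = pi_h(x_h)
        total += c(x, u)                  # accumulate c(x_h, u_h)
        w = rng.normal(size=x.shape)      # zero-mean noise w_h (assumed Gaussian)
        x = f(x, u, w)                    # dynamics step
    return total
```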

2.2.1 A first attempt: Discretization

Can we solve this problem using tools from the finite MDP setting? If $\mathcal{S}$ and $\mathcal{A}$ were finite, then we’d be able to work backwards using the DP algorithm for computing the optimal policy in an MDP (Definition 1.11). This inspires us to try discretizing the problem.

Suppose $\mathcal{S}$ and $\mathcal{A}$ are bounded. We could then discretize each coordinate into finitely many cells and run the DP algorithm on the resulting finite MDP. However, the number of cells grows exponentially in the dimensions $n_x$ and $n_u$ (the curse of dimensionality), and a grid of cells also throws away the smoothness of the underlying dynamics. Can we instead exploit the continuous structure in other ways? This leads us to the linear quadratic regulator.
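Before turning to it, a quick back-of-the-envelope count shows the scale of the blow-up; the grid resolutions below are arbitrary assumptions.

```python
bins_per_dim = 100     # assumed resolution for each coordinate
n_x, n_u = 4, 1        # CartPole: 4 state dimensions, 1 action dimension

n_states = bins_per_dim ** n_x      # 10**8 discrete states
n_actions = bins_per_dim ** n_u     # 10**2 discrete actions
print(n_states * n_actions)         # 10**10 state-action pairs per DP step
```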

2.3 The Linear Quadratic Regulator

The optimal control problem (Definition 2.1) seems highly complex in general. Is there a relevant simplification that we can analyze? The linear quadratic regulator (LQR) is a solvable case and a fundamental tool in control theory.

We will henceforth abbreviate “symmetric positive definite” as s.p.d. and “positive definite” as p.d.
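In the notation of this chapter, the (time-homogeneous) LQR problem combines linear dynamics with a quadratic cost. The boxed definition itself did not survive extraction; the following is the standard statement, using the same $A$, $B$, $Q$, $R$ that appear later in the chapter:

$$
\min_{\pi_0, \dots, \pi_{H-1}} \; \mathbb{E}\left[ \sum_{h=0}^{H-1} \left( x_h^\top Q x_h + u_h^\top R u_h \right) \right]
\quad \text{subject to} \quad x_{h+1} = A x_h + B u_h + w_h, \quad u_h = \pi_h(x_h),
$$

where $Q$ is s.p.d., $R$ is p.d., and $w_h$ is zero-mean noise.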

It will be helpful to reintroduce the value function notation for a policy, now denoting the average cost it incurs. These functions will be instrumental in constructing the optimal policy via dynamic programming, as we did in Section 1.3.2 for MDPs.

2.4 Optimality and the Riccati Equation

In this section, we’ll compute the optimal value function $V^\star_h$, Q-function $Q^\star_h$, and optimal policy $\pi^\star_h$ via dynamic programming, in a very similar way to the DP algorithms in the MDP setting. Recall the definition of the optimal value function: it is the minimum expected total remaining cost,

$$
V^\star_h(x) = \min_{\pi_h, \dots, \pi_{H-1}} \mathbb{E}\left[ \sum_{i=h}^{H-1} c(x_i, u_i) \;\middle|\; x_h = x \right],
$$

and analogously the optimal Q-function fixes the first action:

$$
Q^\star_h(x, u) = \min_{\pi_{h+1}, \dots, \pi_{H-1}} \mathbb{E}\left[ \sum_{i=h}^{H-1} c(x_i, u_i) \;\middle|\; x_h = x, \; u_h = u \right].
$$

Both of the definitions above assume deterministic policies. Otherwise we would have to take an expectation over actions drawn from the policy, i.e. $u_h \sim \pi_h(x_h)$.

We will prove the striking fact that the solution has very simple structure: $V^\star_h$ and $Q^\star_h$ are upward-curved quadratics, and $\pi^\star_h$ is linear and furthermore does not depend on the noise!

Now we’ve shown that $V^\star_h(x) = x^\top P_h x + p_h$, where $P_h$ is s.p.d. and $p_h$ is a scalar. Next, let us examine how the system behaves when we act according to this optimal policy.
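As a concrete companion to this result, here is a minimal numpy sketch of computing the matrices $P_h$ and the feedback gains $K_h$ by the standard backward (Riccati) recursion for the time-homogeneous problem; the function and variable names are mine, not the notes’.

```python
import numpy as np

def riccati_recursion(A, B, Q, R, H):
    """Backward (Riccati) recursion for finite-horizon, time-homogeneous LQR.

    Returns P_h (so that V*_h(x) = x^T P_h x + p_h) and the gains K_h
    (so that the optimal policy is pi*_h(x) = -K_h x).
    """
    n = A.shape[0]
    P = [np.zeros((n, n)) for _ in range(H + 1)]   # no cost beyond the horizon
    K = [None] * H
    for h in reversed(range(H)):
        S = R + B.T @ P[h + 1] @ B                 # curvature of Q*_h in u
        K[h] = np.linalg.solve(S, B.T @ P[h + 1] @ A)
        P[h] = Q + A.T @ P[h + 1] @ A - A.T @ P[h + 1] @ B @ K[h]
    return P, K
```

Acting via `u = -K[h] @ x` at each timestep then gives the linear optimal policy described above.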

2.4.1 Expected state at time $h$

How can we compute the expected state at time $h$ when acting according to the optimal policy? Let’s first express $x_h$ in a cleaner way in terms of the history. Note that having linear dynamics makes it easy to expand terms backwards in time:

$$
\begin{aligned}
x_h &= A x_{h-1} + B u_{h-1} + w_{h-1} \\
&= A (A x_{h-2} + B u_{h-2} + w_{h-2}) + B u_{h-1} + w_{h-1} \\
&= \cdots \\
&= A^h x_0 + \sum_{i=0}^{h-1} A^i (B u_{h-i-1} + w_{h-i-1}).
\end{aligned}
$$


Let’s consider the average state at this time, given all the past states and actions. Since we assume that $\mathbb{E}[w_h] = 0$ (this is the zero vector in $d$ dimensions), when we take an expectation, the $w_h$ term vanishes due to linearity, and so we’re left with


$$
\mathbb{E}[x_h \mid x_{0:(h-1)}, u_{0:(h-1)}] = A^h x_0 + \sum_{i=0}^{h-1} A^i B u_{h-i-1}.
$$
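As a sanity check, the unrolled sum should agree with simply iterating the mean dynamics $\mathbb{E}[x_{h+1}] = A\,\mathbb{E}[x_h] + B u_h$. A small illustrative sketch with randomly chosen matrices:

```python
import numpy as np

rng = np.random.default_rng(0)
n, m, h = 3, 2, 5
A, B = rng.normal(size=(n, n)), rng.normal(size=(n, m))
x0 = rng.normal(size=n)
us = rng.normal(size=(h, m))            # an arbitrary open-loop action sequence

# Unrolled form: A^h x_0 + sum_i A^i B u_{h-i-1}
expected = np.linalg.matrix_power(A, h) @ x0 + sum(
    np.linalg.matrix_power(A, i) @ B @ us[h - i - 1] for i in range(h)
)

# Iterated mean dynamics
mu = x0
for u in us:
    mu = A @ mu + B @ u

assert np.allclose(expected, mu)        # the two forms coincide
```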

This introduces the quantity $A - BK_i$, which shows up frequently in control theory. For example, one important question is: will $x_h$ remain bounded, or will it go to infinity as time goes on? To answer this, let’s imagine for simplicity that these $K_i$s are all equal (call this matrix $K$). Then the expression above becomes $(A - BK)^h x_0$. Now consider the maximum eigenvalue $\lambda_{\max}$ of $A - BK$. If $|\lambda_{\max}| > 1$, then there’s some nonzero initial state $\bar{x}_0$, the corresponding eigenvector, for which


$$
\lim_{h \to \infty} (A - BK)^h \bar{x}_0 = \lim_{h \to \infty} \lambda_{\max}^h \bar{x}_0 = \infty.
$$


Otherwise, if $|\lambda_{\max}| < 1$, then it’s impossible for your original state to explode as dramatically.
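This dichotomy is easy to see numerically. In the sketch below, the matrices and gain are made-up numbers; the largest eigenvalue magnitude of $A - BK$ determines whether the iterates shrink or blow up.

```python
import numpy as np

A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.0], [0.1]])
K = np.array([[2.0, 3.0]])              # some fixed feedback gain (assumed)

M = A - B @ K
print(max(abs(np.linalg.eigvals(M))))   # |lambda_max| of A - BK (here 0.9)

x = np.array([1.0, 0.0])
for h in range(50):
    x = M @ x                           # x_h = (A - BK)^h x_0
print(np.linalg.norm(x))                # shrinks iff |lambda_max| < 1
```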

2.5 Extensions

We’ve now formulated an optimal solution for the time-homogeneous LQR and computed the expected state under the optimal policy. However, real-world tasks rarely have such simple dynamics, and we may wish to design more complex cost functions. In this section, we’ll consider more general settings. To start, we consider the case where the dynamics and cost function are time-dependent. Our analysis remains almost identical; in fact, we can simply add a time index to the matrices $A$ and $B$ that determine the dynamics and the matrices $Q$ and $R$ that determine the cost.

The modified problem is now defined as follows:
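(The boxed definition itself did not survive extraction; the following is the standard statement, obtained by indexing each matrix by $h$.)

$$
\min_{\pi_0, \dots, \pi_{H-1}} \; \mathbb{E}\left[ \sum_{h=0}^{H-1} \left( x_h^\top Q_h x_h + u_h^\top R_h u_h \right) \right]
\quad \text{subject to} \quad x_{h+1} = A_h x_h + B_h u_h + w_h.
$$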

The derivation of the optimal value functions and the optimal policy remains almost exactly the same, and we can modify the Riccati equation accordingly.
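Concretely, the time-dependent version of the recursion simply attaches an index $h$ to each matrix (a standard form, reconstructed here since the displayed equation did not survive extraction):

$$
P_h = Q_h + A_h^\top P_{h+1} A_h - A_h^\top P_{h+1} B_h \left( R_h + B_h^\top P_{h+1} B_h \right)^{-1} B_h^\top P_{h+1} A_h.
$$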

Note that we end an episode at the final state\n","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"ACx581DgjT"},{"type":"inlineMath","value":"\\st_\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"xH\\st_\\horxH","key":"K2NzJZuqj2"},{"type":"text","value":" -- there is no ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"MDEAoeqqkL"},{"type":"inlineMath","value":"\\act_\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"uH\\act_\\horuH","key":"ZegAuqhn00"},{"type":"text","value":", and so we denote the cost for\nthe final state as ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"SiC9E3jJNu"},{"type":"inlineMath","value":"c_\\hor(\\st_\\hor)","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"cH(xH)c_\\hor(\\st_\\hor)cH(xH)","key":"iyJDTwAhkp"},{"type":"text","value":".","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"ItJzvsripi"}],"key":"Ac6ZNgJb47"}],"key":"y67QXqbVrP"}],"key":"mMiII47Hxq"},{"type":"paragraph","position":{"start":{"line":115,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"With all of these components, we can now formulate the ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"gJwcsT7HwH"},{"type":"strong","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"optimal control\nproblem:","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"dw0MkAqFlo"}],"key":"jxP8clQDpE"},{"type":"text","value":" ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"YnqdPQ7VnX"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"compute a policy to minimize the expected undiscounted cost\nover ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"EYJ6b0OoAA"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"H\\horH","key":"rnOndswua7"},{"type":"text","value":" timesteps.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"qZ5rxTH3KR"}],"key":"HzUuKYd7eR"},{"type":"text","value":" In this chapter, we will only consider\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"g1BFCxDIPw"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"deterministic, time-dependent","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"cEbJ4xx8ws"}],"key":"VJ9f64Tpju"},{"type":"text","value":" policies\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"wQHG7wz8b3"},{"type":"inlineMath","value":"\\pi = (\\pi_0, \\dots, \\pi_{H-1})","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"π=(π0,,πH1)\\pi = (\\pi_0, \\dots, \\pi_{H-1})π=(π0,,πH1)","key":"ol9sf8TsYg"},{"type":"text","value":" where ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"kBEIaKvcuf"},{"type":"inlineMath","value":"\\pi_h : \\mathcal{S} \\to \\mathcal{A}","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"πh:SA\\pi_h : \\mathcal{S} \\to 
\\mathcal{A}πh:SA","key":"N2Rkj9Wo0Y"},{"type":"text","value":" for each\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"evbQvEC0ek"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"uaBhZUPTMh"},{"type":"text","value":".","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"V2TiLzImWO"}],"key":"mazUoWCSEO"},{"type":"proof","kind":"definition","label":"optimal_control","identifier":"optimal_control","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"General optimal control problem","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"OkHbVaC4PB"}],"key":"KAKvFhYpZ1"},{"type":"math","value":"\\begin{split}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[\n \\left( \\sum_{\\hi=0}^{\\hor-1} c_\\hi(\\st_\\hi, \\act_\\hi) \\right) + c_\\hor(\\st_\\hor)\n \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi), \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & w_\\hi \\sim \\text{noise}\n\\end{split}","position":{"start":{"line":125,"column":1},"end":{"line":135,"column":1}},"html":"minπ0,,πH1:SAE[(h=0H1ch(xh,uh))+cH(xH)]wherexh+1=fh(xh,uh,wh),uh=πh(xh)x0μ0whnoise\\begin{split}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[\n \\left( \\sum_{\\hi=0}^{\\hor-1} c_\\hi(\\st_\\hi, \\act_\\hi) \\right) + c_\\hor(\\st_\\hor)\n \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi), \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & w_\\hi \\sim \\text{noise}\n\\end{split}π0,,πH1:SAminwhereE[(h=0H1ch(xh,uh))+cH(xH)]xh+1=fh(xh,uh,wh),uh=πh(xh)x0μ0whnoise","enumerator":"2.1","key":"iAB8mBmoLt"}],"enumerator":"2.1","html_id":"optimal-control","key":"PJiCPII96s"},{"type":"heading","depth":3,"position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"A first attempt: Discretization","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"qTBaLis7jR"}],"identifier":"a-first-attempt-discretization","label":"A first attempt: Discretization","html_id":"a-first-attempt-discretization","implicit":true,"enumerator":"2.2.1","key":"kH7s4Xx8ms"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":143,"column":1}},"children":[{"type":"text","value":"Can we solve this problem using tools from the finite MDP setting? 
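{"type":"paragraph","children":[{"type":"text","value":"To make the objective in the optimal control problem concrete, here is a minimal Python sketch, not from the original text, that estimates the expected total cost of a fixed policy by Monte Carlo rollouts. The dynamics f, per-step cost c, terminal cost c_final, initial-state sampler, and the list of per-timestep policies are all hypothetical placeholders supplied by the caller."}]},{"type":"code","lang":"python","value":"import numpy as np\n\ndef rollout_cost(f, c, c_final, policy, s0, H, rng):\n    # Sample one trajectory and return its total undiscounted cost.\n    s, total = s0, 0.0\n    for h in range(H):\n        a = policy[h](s)              # a_h = pi_h(s_h) (deterministic policy)\n        total += c(s, a)              # accumulate the per-step cost c_h(s_h, a_h)\n        w = rng.normal(size=s.shape)  # stochastic noise term w_h\n        s = f(s, a, w)                # s_{h+1} = f_h(s_h, a_h, w_h)\n    return total + c_final(s)         # add the terminal cost c_H(s_H)\n\ndef expected_cost(f, c, c_final, policy, sample_s0, H, n=1000, seed=0):\n    # Monte Carlo estimate of the expected cost that the problem minimizes.\n    rng = np.random.default_rng(seed)\n    return np.mean([rollout_cost(f, c, c_final, policy, sample_s0(rng), H, rng)\n                    for _ in range(n)])"},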
If\n","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"LDGgStrkFb"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"S\\mathcal{S}S","key":"HgA8vN7WPv"},{"type":"text","value":" and ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"ETRUNKZh4p"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"A\\mathcal{A}A","key":"uxNoNX9n3B"},{"type":"text","value":" were finite, then we’d be able to work backwards using the DP algorithm for computing the optimal policy in an MDP (","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"r6jWbjJuVC"},{"type":"crossReference","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"children":[{"type":"text","value":"Definition ","key":"YgxSpEPxTy"},{"type":"text","value":"1.11","key":"BRcZkbxHuz"}],"identifier":"pi_star_dp","label":"pi_star_dp","kind":"proof:definition","template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"O3ZVMHNddQ"},{"type":"text","value":").\nThis inspires us to try ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"FKTh2uBYM6"},{"type":"emphasis","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"children":[{"type":"text","value":"discretizing","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"moxEMajiTh"}],"key":"ZlQbTWRbGQ"},{"type":"text","value":" the\nproblem.","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"t8JylAJ0Zw"}],"key":"OYQkX0yFRX"},{"type":"paragraph","position":{"start":{"line":145,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Suppose ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"ZIV0JqNEGv"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"S\\mathcal{S}S","key":"cNVzLvKvaF"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"w7nwOYyaiP"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"A\\mathcal{A}A","key":"pbYkQbqEXF"},{"type":"text","value":" are bounded, that is,\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"DMYvEzOVyl"},{"type":"inlineMath","value":"\\max_{\\st \\in \\mathcal{S}} \\|\\st\\| \\le B_\\st","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"maxxSxBx\\max_{\\st \\in \\mathcal{S}} \\|\\st\\| \\le B_\\stmaxxSxBx","key":"W3416pu3Fb"},{"type":"text","value":" and\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"g9YWtsnbP9"},{"type":"inlineMath","value":"\\max_{\\act \\in \\mathcal{A}} \\|\\act\\| \\le B_\\act","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"maxuAuBu\\max_{\\act \\in \\mathcal{A}} \\|\\act\\| \\le B_\\actmaxuAuBu","key":"Oa4t47dHOC"},{"type":"text","value":". 
To make ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"Ph4T8WLjho"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"S\\mathcal{S}S","key":"JrZpp79qxw"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"wfdopmk6eI"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"A\\mathcal{A}A","key":"d9KjH9eeKP"},{"type":"text","value":" finite,\nlet’s choose some small positive ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"dsCQ3yEm5K"},{"type":"text","value":"ε","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"pwc5CCT80j"},{"type":"text","value":", and simply round each\ncoordinate to the nearest multiple of ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"F17xYByvRK"},{"type":"text","value":"ε","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"EeizK3PUdr"},{"type":"text","value":". For example, if\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"tKdQ0CVSO3"},{"type":"inlineMath","value":"\\varepsilon = 0.01","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"ε=0.01\\varepsilon = 0.01ε=0.01","key":"HJGa1eh1dC"},{"type":"text","value":", then we round each element of ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"eoA4wt5vlh"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"x\\stx","key":"Mima85Hjuf"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"O2z2riEr7i"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"u\\actu","key":"lIGkW2RHiE"},{"type":"text","value":" to two\ndecimal places.","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"lQGERf6FO7"}],"key":"Ul3Prf0dSI"},{"type":"paragraph","position":{"start":{"line":153,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"However, the discretized ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"iGaXkKC9Lf"},{"type":"inlineMath","value":"\\widetilde{\\mathcal{S}}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~\\widetilde{\\mathcal{S}}S","key":"iioeIO2698"},{"type":"text","value":" and ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"VztqqCL8o1"},{"type":"inlineMath","value":"\\widetilde{\\mathcal{A}}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"A~\\widetilde{\\mathcal{A}}A","key":"Tap98Y7joL"},{"type":"text","value":" may be finite, but\nthey may be infeasibly large: we must divide ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"esEt44lsBH"},{"type":"emphasis","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"each dimension","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"DFebGfkX2M"}],"key":"CKgCZccHED"},{"type":"text","value":" into\nintervals of length 
","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"yyAQGeYpzr"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"ε\\varepsilonε","key":"gyDgDwZQ3c"},{"type":"text","value":", resulting in\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"OawEUNu6Yk"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{S}}| = (B_\\st/\\varepsilon)^{n_\\st}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~=(Bx/ε)nx|\\widetilde{\\mathcal{S}}| = (B_\\st/\\varepsilon)^{n_\\st}S=(Bx/ε)nx","key":"sojFBeASmT"},{"type":"text","value":" and\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"q4MooRVizo"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{A}}| = (B_\\act/\\varepsilon)^{n_\\act}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"A~=(Bu/ε)nu|\\widetilde{\\mathcal{A}}| = (B_\\act/\\varepsilon)^{n_\\act}A=(Bu/ε)nu","key":"S0jJKdgiPI"},{"type":"text","value":". To get a sense of how\nquickly this grows, consider ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"S5yfrjbHWs"},{"type":"inlineMath","value":"\\varepsilon = 0.01, n_\\st = n_\\act = 10","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"ε=0.01,nx=nu=10\\varepsilon = 0.01, n_\\st = n_\\act = 10ε=0.01,nx=nu=10","key":"Xm3K6N5bGE"},{"type":"text","value":".\nThen the number of elements in the transition matrix would be\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"kUBpHxNiHE"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{S}}|^2 |\\widetilde{\\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~2A~=(10010)2(10010)=1060|\\widetilde{\\mathcal{S}}|^2 |\\widetilde{\\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}S2A=(10010)2(10010)=1060","key":"SZUf5QYD7y"},{"type":"text","value":"! (That’s\na trillion trillion trillion trillion trillion.)","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"EssCd2O3Jr"}],"key":"erdoHwlwAN"},{"type":"paragraph","position":{"start":{"line":163,"column":1},"end":{"line":169,"column":1}},"children":[{"type":"text","value":"What properties of the problem could we instead make use of? Note that\nby discretizing the state and action spaces, we implicitly assumed that\nrounding each state or action vector by some tiny amount ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"AeWBQj5CNT"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"html":"ε\\varepsilonε","key":"t8EzyicMKu"},{"type":"text","value":"\nwouldn’t change the behavior of the system by much; namely, that the\ncost and dynamics were relatively ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"IqmNsWFavu"},{"type":"emphasis","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"OxAruMoxLx"}],"key":"tjG12jYlGt"},{"type":"text","value":". Can we use this\ncontinuous structure in other ways? 
This leads us to the ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"c3qP640usa"},{"type":"strong","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"linear\nquadratic regulator","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"VlhqACh4tj"}],"key":"cfEj1t2w73"},{"type":"text","value":".","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"PYQRixftKF"}],"key":"GSTrmUUbiM"},{"type":"heading","depth":2,"position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"children":[{"type":"text","value":"The Linear Quadratic Regulator","position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"key":"udYkbNHoPz"}],"label":"lqr","identifier":"lqr","html_id":"lqr","enumerator":"2.3","key":"HpuVEdXOem"},{"type":"paragraph","position":{"start":{"line":174,"column":1},"end":{"line":175,"column":1}},"children":[{"type":"text","value":"The optimal control problem ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"ZAiXRyuZqt"},{"type":"crossReference","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"Definition ","key":"G1eQyQ1ye4"},{"type":"text","value":"2.1","key":"Owe8lwHKo6"}],"identifier":"optimal_control","label":"optimal_control","kind":"proof:definition","template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"YSzSoXA0gI"},{"type":"text","value":" seems highly complex in general. Is there a relevant simplification that we can analyze?\nThe ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"BURcVWo4d7"},{"type":"strong","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"lyQDMn1dVH"}],"key":"tNj5IDeaNQ"},{"type":"text","value":" (LQR) is a solvable case and a fundamental tool in control theory.","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"dWIhHVzwYR"}],"key":"ZXZsH3RfPk"},{"type":"proof","kind":"definition","label":"lqr_definition","identifier":"lqr_definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The linear quadratic regulator","position":{"start":{"line":177,"column":1},"end":{"line":177,"column":1}},"key":"zJvPwJSRl4"}],"key":"YPrrniBVAe"},{"type":"paragraph","position":{"start":{"line":180,"column":1},"end":{"line":181,"column":1}},"children":[{"type":"text","value":"The LQR problem is a special case of the ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"FCCAAPZiN1"},{"type":"crossReference","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"General optimal control problem","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"hfnTBSp6Qi"}],"identifier":"optimal_control","label":"optimal_control","kind":"proof:definition","template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"R1DDgmDpOt"},{"type":"text","value":" with 
","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"ubNVcI9QiE"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"linear dynamics","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"HXPk7XIB1N"}],"key":"tDZjg4wQFc"},{"type":"text","value":" and an ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"qqFERA1gmv"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"upward-curved quadratic cost function","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"H6XOdc3s3Y"}],"key":"pPs9fPXQ4k"},{"type":"text","value":".\nSolving the LQR problem will additionally enable us to ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"TSgdUDgZzz"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"locally approximate","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"Ar28rlGq8y"}],"key":"mEcmsSfGIf"},{"type":"text","value":" more complex setups using ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"cBwEEvqJ6f"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"Taylor approximations","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"qes2xyF8Fo"}],"key":"xKJrxvufhW"},{"type":"text","value":".","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"ojs1lgxtmN"}],"key":"JR4u3Z3GGW"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"strong","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"Linear, time-homogeneous dynamics","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"o7g1AQmW6D"}],"key":"b0OGEYAYAu"},{"type":"text","value":": for each timestep ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"LN3kPhlhCn"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"UunYkfo0sr"},{"type":"text","value":",","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"NBkbDleCLy"}],"key":"LExV1yn4QP"},{"type":"math","value":"\\begin{aligned}\n \\st_{\\hi+1} &= f(\\st_\\hi, \\act_\\hi, w_\\hi) = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n \\text{where } w_\\hi &\\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}","position":{"start":{"line":185,"column":1},"end":{"line":190,"column":1}},"html":"xh+1=f(xh,uh,wh)=Axh+Buh+whwhere whN(0,σ2I).\\begin{aligned}\n \\st_{\\hi+1} &= f(\\st_\\hi, \\act_\\hi, w_\\hi) = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n \\text{where } w_\\hi &\\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}xh+1where wh=f(xh,uh,wh)=Axh+Buh+whN(0,σ2I).","enumerator":"2.2","key":"YhY5BVxxuh"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"Here, 
","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"BIfSafkKFq"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"html":"whw_\\hiwh","key":"fxqMJXptOL"},{"type":"text","value":" is a spherical Gaussian ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"wcPvEmfSof"},{"type":"strong","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"noise term","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"DsynYWr7eQ"}],"key":"YV9GyfxqKw"},{"type":"text","value":" that makes the dynamics random.\nSetting ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"zKWBZxAv9R"},{"type":"inlineMath","value":"\\sigma = 0","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"html":"σ=0\\sigma = 0σ=0","key":"FW9Km1P4qH"},{"type":"text","value":" gives us ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"uRt9RmcOi7"},{"type":"strong","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"UDO6lICi4b"}],"key":"fFXaqdTeJp"},{"type":"text","value":" state transitions.\nWe will find that the optimal policy actually ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"MAtNa7Rpc6"},{"type":"emphasis","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"does not depend on the noise","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"nEqWhRhdq4"}],"key":"fYE21Wz1J0"},{"type":"text","value":", although the optimal value function and Q-function do.","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"IpiRaHOFRa"}],"key":"sn7DTqCrnf"},{"type":"paragraph","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"children":[{"type":"strong","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"children":[{"type":"text","value":"Upward-curved quadratic, time-homogeneous cost function","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"rL6qNr40e2"}],"key":"ceATuxEoi6"},{"type":"text","value":":","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"iLs7bWxmBf"}],"key":"MptWspS1Uq"},{"type":"math","value":"c(\\st_\\hi, \\act_\\hi) = \\begin{cases}\n \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi & \\hi < \\hor \\\\\n \\st_\\hi^\\top Q \\st_\\hi & \\hi = \\hor\n\\end{cases}.","position":{"start":{"line":198,"column":1},"end":{"line":203,"column":1}},"html":"c(xh,uh)={xhQxh+uhRuhh<HxhQxhh=H.c(\\st_\\hi, \\act_\\hi) = \\begin{cases}\n \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi & \\hi < \\hor \\\\\n \\st_\\hi^\\top Q \\st_\\hi & \\hi = \\hor\n\\end{cases}.c(xh,uh)={xhQxh+uhRuhxhQxhh<Hh=H.","enumerator":"2.3","key":"uNnh8R4TAr"},{"type":"paragraph","position":{"start":{"line":205,"column":1},"end":{"line":207,"column":1}},"children":[{"type":"text","value":"This cost function attempts to stabilize the state and action about ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"NUQmwLLp14"},{"type":"inlineMath","value":"(s^\\star, a^\\star) = (0, 
0)","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"(s,a)=(0,0)(s^\\star, a^\\star) = (0, 0)(s,a)=(0,0)","key":"ervf0eQidg"},{"type":"text","value":".\nWe require ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"EEeseaJbbw"},{"type":"inlineMath","value":"Q \\in \\R^{n_\\st \\times n_\\st}","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"QRnx×nxQ \\in \\R^{n_\\st \\times n_\\st}QRnx×nx","key":"y8eSwq6Z25"},{"type":"text","value":" and ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"H3MnWkJs1a"},{"type":"inlineMath","value":"R \\in \\R^{n_\\act \\times n_\\act}","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"RRnu×nuR \\in \\R^{n_\\act \\times n_\\act}RRnu×nu","key":"oou1Ojr5t9"},{"type":"text","value":" to both be ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"KG64p39gyS"},{"type":"emphasis","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"positive definite","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"jneYjyMe1c"}],"key":"UeeADl6jov"},{"type":"text","value":" matrices so that ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"slcfjFaQQ0"},{"type":"inlineMath","value":"c","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"ccc","key":"Vv3QwdVzwP"},{"type":"text","value":" has a well-defined unique minimum.\nWe can furthermore assume without loss of generality that they are both ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"DFIaenMCT7"},{"type":"emphasis","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"symmetric","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"cysqu4251R"}],"key":"l4p7llGHPn"},{"type":"text","value":" (see exercise below).","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"ngQQ52yrMc"}],"key":"XQFz6qwjtj"},{"type":"paragraph","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"This results in the LQR optimization problem:","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"key":"v67iaxHIpv"}],"key":"J3QwkX8OxK"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}","position":{"start":{"line":211,"column":1},"end":{"line":219,"column":1}},"html":"minπ0,,πH1:SAE[(h=0H1xhQxh+uhRuh)+xHQxH]wherexh+1=Axh+Buh+whuh=πh(xh)whN(0,σ2I)x0μ0.\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & 
w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}π0,,πH1:SAminwhereE[(h=0H1xhQxh+uhRuh)+xHQxH]xh+1=Axh+Buh+whuh=πh(xh)whN(0,σ2I)x0μ0.","enumerator":"2.4","key":"QSiVLkupeS"}],"enumerator":"2.2","html_id":"lqr-definition","key":"AbSilNBdEs"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"iszxO0H3yH"}],"key":"KZZSSfRiwq"},{"type":"paragraph","position":{"start":{"line":223,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"text","value":"Here we’ll show that we don’t lose generality by assuming that ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"LgLOWvSAEZ"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"YhvoGw5fFX"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"dKFT4bXqot"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"GVQenDVltb"},{"type":"text","value":" are symmetric.\nShow that replacing ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"dwM4zeavyA"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"lEoSlYZI11"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"PtKPFomxjt"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"y9Y5aDXbxi"},{"type":"text","value":" with ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"APrFIqYtrf"},{"type":"inlineMath","value":"(Q + Q^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(Q+Q)/2(Q + Q^\\top) / 2(Q+Q)/2","key":"BBOqjXGiD5"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"LIRE9rJnT1"},{"type":"inlineMath","value":"(R + R^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(R+R)/2(R + R^\\top) / 2(R+R)/2","key":"cXcgOYMWT0"},{"type":"text","value":" (which are symmetric) yields the same cost function.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"vQFFCSOxrG"}],"key":"zdXrFESCT8"}],"key":"lwmmf7F7J6"},{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"text","value":"We will henceforth abbreviate “symmetric positive definite” as s.p.d.\nand “positive definite” as p.d.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"iBIRVFMoQa"}],"key":"ABUZis7GwO"},{"type":"paragraph","position":{"start":{"line":230,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"It will be helpful to reintroduce the ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"AlzCbC2Qgt"},{"type":"emphasis","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"UIWaUKwKYl"}],"key":"WFIlqrxqRY"},{"type":"text","value":" notation for a policy to 
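{"type":"paragraph","children":[{"type":"text","value":"As a quick numerical check of the exercise above (this snippet is illustrative, not from the original text): any square matrix induces the same quadratic form as its symmetric part, since the antisymmetric part contributes nothing."}]},{"type":"code","lang":"python","value":"import numpy as np\n\nrng = np.random.default_rng(0)\nQ = rng.normal(size=(4, 4))  # an arbitrary, generally non-symmetric matrix\nx = rng.normal(size=4)\n\n# x^T Q x = x^T ((Q + Q^T) / 2) x for every x, since x^T Q^T x = x^T Q x.\nassert np.isclose(x @ Q @ x, x @ ((Q + Q.T) / 2) @ x)"},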
denote the average cost it incurs.\nThese will be instrumental in constructing the optimal policy via ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"QPsn8TWKaK"},{"type":"strong","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"dynamic programming,","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"IClDIZEH5c"}],"key":"tdQEzxTuns"},{"type":"text","value":"\nas we did in ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"L9KdAJaKPx"},{"type":"crossReference","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"Section ","key":"OShlppLZYi"},{"type":"text","value":"1.3.2","key":"juEZZICtwm"}],"identifier":"opt_dynamic_programming","label":"opt_dynamic_programming","kind":"heading","template":"Section %s","enumerator":"1.3.2","resolved":true,"html_id":"opt-dynamic-programming","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"gJcijK4lf9"},{"type":"text","value":" for MDPs.","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"vAMH1vflPO"}],"key":"JHKC7cvBqL"},{"type":"proof","kind":"definition","label":"value_lqr","identifier":"value_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Value functions for LQR","position":{"start":{"line":234,"column":1},"end":{"line":234,"column":1}},"key":"Rvwzs58i5U"}],"key":"MKZGhN1cTr"},{"type":"paragraph","position":{"start":{"line":237,"column":1},"end":{"line":238,"column":1}},"children":[{"type":"text","value":"Given a policy ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"KedvKGD6Yw"},{"type":"inlineMath","value":"\\mathbf{\\pi} = (\\pi_0, \\dots, \\pi_{\\hor-1})","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"html":"π=(π0,,πH1)\\mathbf{\\pi} = (\\pi_0, \\dots, \\pi_{\\hor-1})π=(π0,,πH1)","key":"atXMZzyhgp"},{"type":"text","value":",\nwe can define its value function ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"p9DMqqgdcc"},{"type":"inlineMath","value":"V^\\pi_\\hi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"html":"Vhπ:SRV^\\pi_\\hi : \\mathcal{S} \\to \\mathbb{R}Vhπ:SR","key":"PN20BeDrVo"},{"type":"text","value":" at time ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"bJLaPOmwdN"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"D7cCPCyjbi"},{"type":"text","value":" as the average ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"NUlPGJDJSJ"},{"type":"strong","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"children":[{"type":"text","value":"cost-to-go","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"MaScklozVZ"}],"key":"ujhzf9R19s"},{"type":"text","value":" incurred by that policy:","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"O7tpJzi7Bh"}],"key":"aiOFGn5ZLE"},{"type":"math","value":"\\begin{split}\n V^\\pi_\\hi (\\st) &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n 
&= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n\\end{split}","position":{"start":{"line":240,"column":1},"end":{"line":245,"column":1}},"html":"Vhπ(x)=E[(i=hH1c(xi,ui))+c(xH)xh=x,ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxHxh=x,ui=πi(xi)hi<H]\\begin{split}\n V^\\pi_\\hi (\\st) &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n\\end{split}Vhπ(x)=E[(i=hH1c(xi,ui))+c(xH)xh=x,ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxHxh=x,ui=πi(xi)hi<H]","enumerator":"2.5","key":"zZ68eHIidQ"},{"type":"paragraph","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"The Q-function additionally conditions on the first action we take:","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"HOWNBF4sxi"}],"key":"s1LrhUeU5N"},{"type":"math","value":"\\begin{split}\n Q^\\pi_\\hi (\\st, \\act) &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":249,"column":1},"end":{"line":256,"column":1}},"html":"Qhπ(x,u)=E[(i=hH1c(xi,ui))+c(xH)(xh,uh)=(x,u),ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxH(xh,uh)=(x,u),ui=πi(xi)hi<H]\\begin{split}\n Q^\\pi_\\hi (\\st, \\act) &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}Qhπ(x,u)=E[(i=hH1c(xi,ui))+c(xH)(xh,uh)=(x,u),ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxH(xh,uh)=(x,u),ui=πi(xi)hi<H]","enumerator":"2.6","key":"RacrxZXaGb"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"Note that since we use ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"o3YzdMXxZf"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"cost","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"KGviS3gWVB"}],"key":"utvHBpPMbU"},{"type":"text","value":" instead of 
","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"f7sJu4lHcY"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"reward,","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"t2LaxE1AfG"}],"key":"yjxyLXg1EQ"},{"type":"text","value":"\nthe best policies are the ones with ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"PAjxOp1jIY"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"smaller","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"WLBcP1rT6R"}],"key":"cG1vdg6G0o"},{"type":"text","value":" values of the value function.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"stRS3GSMqi"}],"key":"J95nZ3uAqQ"}],"enumerator":"2.3","html_id":"value-lqr","key":"G9ZfSYYyzc"},{"type":"heading","depth":2,"position":{"start":{"line":263,"column":1},"end":{"line":263,"column":1}},"children":[{"type":"text","value":"Optimality and the Riccati Equation","position":{"start":{"line":263,"column":1},"end":{"line":263,"column":1}},"key":"v6vJqzbTac"}],"label":"optimal_lqr","identifier":"optimal_lqr","html_id":"optimal-lqr","enumerator":"2.4","key":"sYv18B7tg6"},{"type":"paragraph","position":{"start":{"line":265,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"In this section,\nwe’ll compute the optimal value function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"qmtU9gefHP"},{"type":"inlineMath","value":"V^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"VhV^\\star_hVh","key":"e5HqrI957a"},{"type":"text","value":",\nQ-function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"A6abCcUKaG"},{"type":"inlineMath","value":"Q^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"QhQ^\\star_hQh","key":"vvpvHoXvUJ"},{"type":"text","value":",\nand policy ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"AfzZLpcXnz"},{"type":"inlineMath","value":"\\pi^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"πh\\pi^\\star_hπh","key":"zb0Q7hrWEM"},{"type":"text","value":" in ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"gW4RXeVLUG"},{"type":"crossReference","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"the linear quadratic regulator","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"hyGXle6Jn1"}],"identifier":"lqr_definition","label":"lqr_definition","kind":"proof:definition","template":"Definition %s","enumerator":"2.2","resolved":true,"html_id":"lqr-definition","key":"HmO4J0Dqk0"},{"type":"text","value":" using ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"Hb3YzbAcB3"},{"type":"strong","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"nSM3vPIBbv"}],"key":"Opsiwq3Ozw"},{"type":"text","value":"\nin a very similar way to the DP algorithms 
","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"VnwPzYhiXV"},{"type":"crossReference","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"gRpd67Eklg"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"W5odO4JhIJ"},{"type":"text","value":".\nRecall the definition of the optimal value function:","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"RwKDJGkeZH"}],"key":"NfPfstr9XD"},{"type":"proof","kind":"definition","label":"optimal_value_lqr","identifier":"optimal_value_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"JG6ULbMAXe"}],"key":"qINuHUpqCj"},{"type":"paragraph","position":{"start":{"line":275,"column":1},"end":{"line":277,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"TlgczyLFqz"},{"type":"strong","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"u2VgE1KlUN"}],"key":"QzXJP6ldQ5"},{"type":"text","value":" is the one that,\nat any time and in any state,\nachieves ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"dsuoCDNNdf"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"minimum cost","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"PXzSWqL4vy"}],"key":"kp5Cr0qR4Q"},{"type":"text","value":" across ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"zb0D7CixoE"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"all policies","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"IxIaC2VIHO"}],"key":"mrcoiQwae1"},{"type":"text","value":":","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"PXTPMpP1so"}],"key":"drD8Rv5ExG"},{"type":"math","value":"\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":279,"column":1},"end":{"line":285,"column":1}},"html":"Vh(x)=minπh,,πH1Vhπ(x)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) 
\\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}Vh(x)=πh,,πH1minVhπ(x)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]","enumerator":"2.7","key":"yj97bWb9qz"},{"type":"paragraph","position":{"start":{"line":287,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"text","value":"The optimal Q-function is defined similarly,\nconditioned on the starting action as well:","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"wcxrf5V6T9"}],"key":"j9kgM0U80q"},{"type":"math","value":"\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":290,"column":1},"end":{"line":296,"column":1}},"html":"Qh(x,u)=minπh,,πH1Qhπ(x,u)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}Qh(x,u)=πh,,πH1minQhπ(x,u)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]","enumerator":"2.8","key":"XukP0wEEon"},{"type":"paragraph","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"Both of the definitions above assume ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"uXhfVfZ2sO"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"UdZc6maosB"}],"key":"MVhRLhFgG1"},{"type":"text","value":" policies. Otherwise we would have to take an ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"kb9ceMluke"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"expectation","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"mNfdaE6jKw"}],"key":"nvkAV91KJy"},{"type":"text","value":" over actions drawn from the policy, i.e. 
","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"xe2k94lJLs"},{"type":"inlineMath","value":"\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"uhπh(xh)\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)uhπh(xh)","key":"wmwRPXkulA"},{"type":"text","value":".","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"usVG3Q5HpV"}],"key":"Rfkfl1xchE"}],"enumerator":"2.4","html_id":"optimal-value-lqr","key":"i3UYVWFmdH"},{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"text","value":"We will prove the striking fact that the solution has very simple structure:\n","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"RUPxqHaBkL"},{"type":"inlineMath","value":"V_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"VhV_h^\\starVh","key":"d8uEyvHPm3"},{"type":"text","value":" and ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"gCQj00NJrf"},{"type":"inlineMath","value":"Q^\\star_h","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"QhQ^\\star_hQh","key":"BmiLWEkJZw"},{"type":"text","value":" are ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"JWtCA6a7SZ"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"upward-curved quadratics","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"XgYPdHUTWI"}],"key":"N0klf7Zbyc"},{"type":"text","value":"\nand ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"FUNXzr3AR4"},{"type":"inlineMath","value":"\\pi_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"πh\\pi_h^\\starπh","key":"avLRGrUoxM"},{"type":"text","value":" is ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"gKe7ADrIX0"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"s4lhdkdNuN"}],"key":"M8JIyoKtJd"},{"type":"text","value":" and furthermore does not depend on the noise!","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"jy24gzd0g1"}],"key":"Dz2VNg8N9M"},{"type":"proof","kind":"theorem","label":"optimal_value_lqr_quadratic","identifier":"optimal_value_lqr_quadratic","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR is an upward-curved quadratic","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"qknRSzvGqr"}],"key":"fhHNgYit5u"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"bv8Dqbh1tJ"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"WQRTimzX0j"},{"type":"text","value":",","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"TftrjSCdBl"}],"key":"mXS5qTIZvZ"},{"type":"math","value":"V^\\star_\\hi(\\st) = 
\\st^\\top P_\\hi \\st + p_\\hi","position":{"start":{"line":310,"column":1},"end":{"line":312,"column":1}},"html":"Vh(x)=xPhx+phV^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hiVh(x)=xPhx+ph","enumerator":"2.9","key":"Qadq6kkHLD"},{"type":"paragraph","position":{"start":{"line":314,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"text","value":"for some s.p.d. matrix ","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"uuYNepnOo6"},{"type":"inlineMath","value":"P_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"PhRnx×nxP_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}PhRnx×nx","key":"rmZ1N1vHeG"},{"type":"text","value":" and scalar\n","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"ba658IJkkm"},{"type":"inlineMath","value":"p_\\hi \\in \\mathbb{R}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"phRp_\\hi \\in \\mathbb{R}phR","key":"TZfWb8yoiu"},{"type":"text","value":".","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"p6zpsA86Gy"}],"key":"htE9q8Sf3d"}],"enumerator":"2.1","html_id":"optimal-value-lqr-quadratic","key":"pyHuNgiw9T"},{"type":"proof","kind":"theorem","label":"optimal_policy_lqr_linear","identifier":"optimal_policy_lqr_linear","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal policy in LQR is linear","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"LcnjmPtewF"}],"key":"WQJMXut6uM"},{"type":"paragraph","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"fnVYGE0UoF"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"IGgQUCGuDN"},{"type":"text","value":",","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"ibf0mD8jHE"}],"key":"WpiQ9x7fFu"},{"type":"math","value":"\\pi^\\star_\\hi (\\st) = - K_\\hi \\st","position":{"start":{"line":323,"column":1},"end":{"line":325,"column":1}},"html":"πh(x)=Khx\\pi^\\star_\\hi (\\st) = - K_\\hi \\stπh(x)=Khx","enumerator":"2.10","key":"ujzydQVZpm"},{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":328,"column":1}},"children":[{"type":"text","value":"for some ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"RgB7d6654U"},{"type":"inlineMath","value":"K_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"KhRnu×nxK_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}KhRnu×nx","key":"TixrspvzoX"},{"type":"text","value":".\n(The negative is due to convention.)","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"byoAyy5uex"}],"key":"QeffyWb6Sb"}],"enumerator":"2.2","html_id":"optimal-policy-lqr-linear","key":"GoYaMnqfFi"},{"type":"paragraph","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"The construction (and inductive proof) proceeds similarly to the one 
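Before proving these theorems, it can help to see them in action. The following sketch (our own illustration, not from the text; it assumes `numpy`, and the scalar system and all constants are made up) solves a one-step scalar LQR problem by brute-force grid search over actions, then checks that the resulting value function is quadratic in the state and the minimizing action is linear in the state:

```python
import numpy as np

# One step to go, no noise: V(x) = min_u [ q x^2 + r u^2 + q (a x + b u)^2 ].
a, b, q, r = 1.2, 0.8, 1.0, 0.5          # arbitrary illustrative constants
xs = np.linspace(-2.0, 2.0, 9)           # a few starting states
us = np.linspace(-10.0, 10.0, 20001)     # dense grid of candidate actions

costs = [q * x**2 + r * us**2 + q * (a * x + b * us) ** 2 for x in xs]
V = np.array([c.min() for c in costs])              # brute-force value function
u_star = np.array([us[c.argmin()] for c in costs])  # brute-force optimal action

print(np.polyfit(xs, V, deg=2))       # only the x^2 coefficient is (essentially) nonzero
print(np.polyfit(xs, u_star, deg=1))  # u*(x) = -k x: linear with zero intercept
```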
","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"AK26ETRY9Q"},{"type":"crossReference","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"f5FUtHVcW3"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"fPaEdogObm"},{"type":"text","value":".","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"lZMPONg9rs"}],"key":"lY6xYOVDUS"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":333,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"We’ll compute ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"L7C1TLnHDn"},{"type":"inlineMath","value":"V_\\hor^\\star","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"VHV_\\hor^\\starVH","key":"wRyuBVkC1t"},{"type":"text","value":" (at the end of the horizon) as our base case.","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"r17esaUqm3"}],"key":"M6pe4TqWml"},{"type":"listItem","spread":true,"position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Then we’ll work step-by-step backwards in time, using ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"msYQc4SMLW"},{"type":"inlineMath","value":"V_{\\hi+1}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"Vh+1V_{\\hi+1}^\\starVh+1","key":"J0eVcisFNw"},{"type":"text","value":" to compute ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"SBKkBnejZ8"},{"type":"inlineMath","value":"Q_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"QhQ_\\hi^\\starQh","key":"btEun0PHQ8"},{"type":"text","value":", ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"JfJErKELEp"},{"type":"inlineMath","value":"\\pi_{\\hi}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"πh\\pi_{\\hi}^\\starπh","key":"qC5xBsR1kt"},{"type":"text","value":", and ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"nMiK25qbI1"},{"type":"inlineMath","value":"V_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"VhV_\\hi^\\starVh","key":"ne3nWfiKKs"},{"type":"text","value":".","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"Lz0vDQ2ZJu"}],"key":"XxeHop5MvR"}],"key":"o3soQyuxWY"},{"type":"comment","value":" TODO insert reference for proof by induction ","key":"O085FDlUUU"},{"type":"paragraph","position":{"start":{"line":338,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"strong","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"children":[{"type":"text","value":"Base case:","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"feFJjlQIPr"}],"key":"SpOFdAxX2y"},{"type":"text","value":"\nAt the final timestep,\nthere are no possible actions to take,\nand so 
","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"yKoDM0NlW5"},{"type":"inlineMath","value":"V^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\st","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=c(x)=xQxV^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\stVH(x)=c(x)=xQx","key":"ErkKvtwSDn"},{"type":"text","value":".\nThus ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"H9807kJjU5"},{"type":"inlineMath","value":"V_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\hor","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=xPHx+pHV_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\horVH(x)=xPHx+pH","key":"LrdezXnijs"},{"type":"text","value":"\nwhere ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"Z2gMLGceBk"},{"type":"inlineMath","value":"P_\\hor = Q","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"PH=QP_\\hor = QPH=Q","key":"XS29i7ncaU"},{"type":"text","value":" and ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"sgjokKfzJ7"},{"type":"inlineMath","value":"p_\\hor = 0","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"pH=0p_\\hor = 0pH=0","key":"UPa8XqqmWh"},{"type":"text","value":".","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"QwBLygqGgg"}],"key":"j9tqYyfwWd"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"strong","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"Inductive hypothesis:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"ARG0a5L6zU"}],"key":"S39T6ujfTj"},{"type":"text","value":"\nWe seek to show that the inductive step holds for both theorems:\nIf ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"oMYeQvOvzE"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh+1(x)V^\\star_{\\hi+1}(\\st)Vh+1(x)","key":"WUvX2Xmv2G"},{"type":"text","value":" is an upward-curved quadratic,\nthen ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"pnzB31tWs0"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"qXBhLd8grF"},{"type":"text","value":" must also be an upward-curved quadratic,\nand ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"ABtkahjBMD"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"πh(x)\\pi^\\star_\\hi(\\st)πh(x)","key":"oHwwOViAP2"},{"type":"text","value":" must be linear.\nWe’ll break this down into the following steps:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"opZ8qAgCN9"}],"key":"rRqZJcgzx9"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":352,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":352,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"Show that 
","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"WPqbkUZZdg"},{"type":"inlineMath","value":"Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"Qh(x,u)Q^\\star_\\hi(\\st, \\act)Qh(x,u)","key":"pjtKrsfYde"},{"type":"text","value":" is an upward-curved quadratic (in both\n","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"EUR0KWQ4M5"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"x\\stx","key":"F9tbQcfuX4"},{"type":"text","value":" and ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"aGEq61hiF3"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"u\\actu","key":"IuB7j1ja6l"},{"type":"text","value":").","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"axl5zKkxYp"}],"key":"WVacC1J6oX"},{"type":"listItem","spread":true,"position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"text","value":"Derive the optimal policy\n","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"EEsNheg0bd"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"html":"πh(x)=argminuQh(x,u)\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)πh(x)=argminuQh(x,u)","key":"jgmOpiTTqf"},{"type":"text","value":" and show\nthat it’s linear.","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"Crj05IH4JE"}],"key":"ZzWE0vm512"},{"type":"listItem","spread":true,"position":{"start":{"line":357,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"Show that ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"vsyTW4fJ4G"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"rWmKNquBg1"},{"type":"text","value":" is an upward-curved quadratic.","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"ynxHz4LigB"}],"key":"YfdsFKoDPv"}],"key":"Ri9DKsFVg6"},{"type":"paragraph","position":{"start":{"line":359,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"text","value":"We first assume the inductive hypothesis that our theorems are true at\ntime ","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"YTUkSwro3U"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"html":"h+1\\hi+1h+1","key":"lv7irUP9GT"},{"type":"text","value":". 
That is,","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"irfatJlLtM"}],"key":"OizcBhV7Tr"},{"type":"math","value":"V^\\star_{\\hi+1}(\\st) = \\st^\\top P_{\\hi+1} \\st + p_{\\hi+1} \\quad \\forall \\st \\in \\mathcal{S}.","position":{"start":{"line":362,"column":1},"end":{"line":364,"column":1}},"html":"Vh+1(x)=xPh+1x+ph+1xS.V^\\star_{\\hi+1}(\\st) = \\st^\\top P_{\\hi+1} \\st + p_{\\hi+1} \\quad \\forall \\st \\in \\mathcal{S}.Vh+1(x)=xPh+1x+ph+1xS.","enumerator":"2.11","key":"et9HTtPerg"},{"type":"proof","kind":"lemma","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"inlineMath","value":"Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":366,"column":1},"end":{"line":366,"column":1}},"html":"Qh(x,u)Q^\\star_\\hi(\\st, \\act)Qh(x,u)","key":"RD9AphX1Rd"},{"type":"text","value":" is an upward-curved quadratic","position":{"start":{"line":366,"column":1},"end":{"line":366,"column":1}},"key":"ohrbSD5XGW"}],"key":"uD5E2h053U"},{"type":"paragraph","position":{"start":{"line":367,"column":1},"end":{"line":368,"column":1}},"children":[{"type":"text","value":"Let us decompose ","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"key":"Vd7yxEtkCE"},{"type":"inlineMath","value":"Q^\\star_\\hi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"html":"Qh:S×ARQ^\\star_\\hi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}Qh:S×AR","key":"aAmgiJmRsb"},{"type":"text","value":"\ninto the immediate reward plus the expected cost-to-go:","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"key":"SdslDqNgSt"}],"key":"n2psj9Md3u"},{"type":"math","value":"Q^\\star_\\hi(\\st, \\act) = c(\\st, \\act) + \\E_{\\st' \\sim f(\\st, \\act, w_{\\hi+1})} [V^\\star_{\\hi+1}(\\st')].","position":{"start":{"line":370,"column":1},"end":{"line":372,"column":1}},"html":"Qh(x,u)=c(x,u)+Exf(x,u,wh+1)[Vh+1(x)].Q^\\star_\\hi(\\st, \\act) = c(\\st, \\act) + \\E_{\\st' \\sim f(\\st, \\act, w_{\\hi+1})} [V^\\star_{\\hi+1}(\\st')].Qh(x,u)=c(x,u)+Exf(x,u,wh+1)[Vh+1(x)].","enumerator":"2.12","key":"lj0W9mjDV0"},{"type":"paragraph","position":{"start":{"line":374,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"Recall ","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"llyTxNWLXw"},{"type":"inlineMath","value":"c(\\st, \\act) := \\st^\\top Q \\st + \\act^\\top R \\act","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"html":"c(x,u):=xQx+uRuc(\\st, \\act) := \\st^\\top Q \\st + \\act^\\top R \\actc(x,u):=xQx+uRu","key":"oS3VZAKKgF"},{"type":"text","value":".\nLet’s consider the expectation over the next timestep.\nThe only randomness in the dynamics comes from the noise\n","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"EjOM0qTKbt"},{"type":"inlineMath","value":"w_{\\hi+1} \\sim \\mathcal{N}(0, \\sigma^2 I)","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"html":"wh+1N(0,σ2I)w_{\\hi+1} \\sim \\mathcal{N}(0, \\sigma^2 I)wh+1N(0,σ2I)","key":"wwo85oQfCF"},{"type":"text","value":",\nso we can expand the expectation as:","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"mczhJsP8Ns"}],"key":"aP88zkLh4z"},{"type":"math","value":"\\begin{aligned}\n & \\E_{\\st'} [V^\\star_{\\hi+1}(\\st')] \\\\\n {} = {} & \\E_{w_{\\hi+1}} [V^\\star_{\\hi+1}(A \\st + B \\act + w_{\\hi+1})] & & 
\\text{definition of } f \\\\\n {} = {} & \\E_{w_{\\hi+1}} [ (A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1} ]. & & \\text{inductive hypothesis}\n\\end{aligned}","position":{"start":{"line":380,"column":1},"end":{"line":386,"column":1}},"html":"Ex[Vh+1(x)]=Ewh+1[Vh+1(Ax+Bu+wh+1)]definition of f=Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1].inductive hypothesis\\begin{aligned}\n & \\E_{\\st'} [V^\\star_{\\hi+1}(\\st')] \\\\\n {} = {} & \\E_{w_{\\hi+1}} [V^\\star_{\\hi+1}(A \\st + B \\act + w_{\\hi+1})] & & \\text{definition of } f \\\\\n {} = {} & \\E_{w_{\\hi+1}} [ (A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1} ]. & & \\text{inductive hypothesis}\n\\end{aligned}==Ex[Vh+1(x)]Ewh+1[Vh+1(Ax+Bu+wh+1)]Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1].definition of finductive hypothesis","enumerator":"2.13","key":"qYAlMAx2xz"},{"type":"paragraph","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"children":[{"type":"text","value":"Summing and combining like terms, we get","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"pVFUSG9kOR"}],"key":"I7HE2oPhx2"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top Q \\st + \\act^\\top R \\act + \\E_{w_{\\hi+1}} [(A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1}] \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A)\\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] + p_{\\hi+1}.\n\\end{aligned}","position":{"start":{"line":390,"column":1},"end":{"line":396,"column":1}},"html":"Qh(x,u)=xQx+uRu+Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1]=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+Ewh+1[wh+1Ph+1wh+1]+ph+1.\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top Q \\st + \\act^\\top R \\act + \\E_{w_{\\hi+1}} [(A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1}] \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A)\\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] + p_{\\hi+1}.\n\\end{aligned}Qh(x,u)=xQx+uRu+Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1]=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+Ewh+1[wh+1Ph+1wh+1]+ph+1.","enumerator":"2.14","key":"rWC0187JFV"},{"type":"paragraph","position":{"start":{"line":398,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"text","value":"Note that the terms that are linear in ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"QoBNdX892u"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"whw_\\hiwh","key":"uOwTBqzrAq"},{"type":"text","value":" have mean\nzero and vanish. 
Now consider the remaining expectation over the noise.\nBy expanding out the product and using linearity of expectation, we can\nwrite this out as","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"OSmbfqreb4"}],"key":"APednm64Y0"},{"type":"math","value":"\\begin{aligned}\n \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] & = \\sum_{i=1}^d \\sum_{j=1}^d (P_{\\hi+1})_{ij} \\E_{w_{\\hi+1}} [(w_{\\hi+1})_i (w_{\\hi+1})_j] \\\\\n & = \\sigma^2 \\mathrm{Tr}(P_{\\hi + 1})\n\\end{aligned}","position":{"start":{"line":403,"column":1},"end":{"line":408,"column":1}},"html":"Ewh+1[wh+1Ph+1wh+1]=i=1dj=1d(Ph+1)ijEwh+1[(wh+1)i(wh+1)j]=σ2Tr(Ph+1)\\begin{aligned}\n \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] & = \\sum_{i=1}^d \\sum_{j=1}^d (P_{\\hi+1})_{ij} \\E_{w_{\\hi+1}} [(w_{\\hi+1})_i (w_{\\hi+1})_j] \\\\\n & = \\sigma^2 \\mathrm{Tr}(P_{\\hi + 1})\n\\end{aligned}Ewh+1[wh+1Ph+1wh+1]=i=1dj=1d(Ph+1)ijEwh+1[(wh+1)i(wh+1)j]=σ2Tr(Ph+1)","enumerator":"2.15","key":"iHvatTGaWp"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Quadratic forms","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"UnssONB3HZ"}],"key":"w3NXou1YYF"},{"type":"paragraph","position":{"start":{"line":411,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"text","value":"When solving ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"JwavD42hZx"},{"type":"emphasis","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"children":[{"type":"text","value":"quadratic forms","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"cs9k1YuI1u"}],"key":"GIuh99TZ4e"},{"type":"text","value":", i.e. 
expressions of the form ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"tYGWSMcIYu"},{"type":"inlineMath","value":"x^\\top A x","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"xAxx^\\top A xxAx","key":"ar0Ml1WInM"},{"type":"text","value":",\nit’s often helpful to consider the terms on the diagonal (","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"F9wI7kvAmh"},{"type":"inlineMath","value":"i = j","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"i=ji = ji=j","key":"xGRryxCtiC"},{"type":"text","value":") separately from those off the diagonal.","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"raJivYyn90"}],"key":"Z4MuCRM3m3"},{"type":"paragraph","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"In this case, the expectation of each diagonal term becomes","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"key":"jwNVZCkNNR"}],"key":"plvxPQPX1h"},{"type":"math","value":"(P_{\\hi+1})_{ii} \\E (w_{\\hi+1})_i^2 = \\sigma^2 (P_{\\hi+1})_{ii}.","position":{"start":{"line":417,"column":1},"end":{"line":419,"column":1}},"html":"(Ph+1)iiE(wh+1)i2=σ2(Ph+1)ii.(P_{\\hi+1})_{ii} \\E (w_{\\hi+1})_i^2 = \\sigma^2 (P_{\\hi+1})_{ii}.(Ph+1)iiE(wh+1)i2=σ2(Ph+1)ii.","enumerator":"2.16","key":"n9GE7EeIJM"},{"type":"paragraph","position":{"start":{"line":421,"column":1},"end":{"line":423,"column":1}},"children":[{"type":"text","value":"Off the diagonal, since the elements of ","position":{"start":{"line":421,"column":1},"end":{"line":421,"column":1}},"key":"smrac6DeQB"},{"type":"inlineMath","value":"w_{\\hi+1}","position":{"start":{"line":421,"column":1},"end":{"line":421,"column":1}},"html":"wh+1w_{\\hi+1}wh+1","key":"YQpxKV9MWq"},{"type":"text","value":" are independent, the\nexpectation factors, and since each element has mean zero, the term\nvanishes:","position":{"start":{"line":421,"column":1},"end":{"line":421,"column":1}},"key":"mqmXsexq2d"}],"key":"be5c7pe0nv"},{"type":"math","value":"(P_{\\hi+1})_{ij} \\E [(w_{\\hi+1})_i] \\E [(w_{\\hi+1})_j] = 0.","position":{"start":{"line":425,"column":1},"end":{"line":427,"column":1}},"html":"(Ph+1)ijE[(wh+1)i]E[(wh+1)j]=0.(P_{\\hi+1})_{ij} \\E [(w_{\\hi+1})_i] \\E [(w_{\\hi+1})_j] = 0.(Ph+1)ijE[(wh+1)i]E[(wh+1)j]=0.","enumerator":"2.17","key":"CJoygLrATv"},{"type":"paragraph","position":{"start":{"line":429,"column":1},"end":{"line":431,"column":1}},"children":[{"type":"text","value":"Thus,\nthe only terms left are the ones on the diagonal,\nso the sum of these can be expressed as the trace of ","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"key":"qaL9OXbDSK"},{"type":"inlineMath","value":"\\sigma^2 P_{\\hi+1}","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"html":"σ2Ph+1\\sigma^2 P_{\\hi+1}σ2Ph+1","key":"b6A8JG9kvu"},{"type":"text","value":":","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"key":"ev2CLGSibd"}],"key":"JWfENGmUH1"},{"type":"math","value":"\\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] = \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}).","position":{"start":{"line":433,"column":1},"end":{"line":435,"column":1}},"html":"Ewh+1[wh+1Ph+1wh+1]=σ2Tr(Ph+1).\\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] = \\sigma^2 
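This trace identity is easy to sanity-check numerically. Here is a minimal sketch (our own check, assuming `numpy`; the dimension, noise scale, and matrix are arbitrary) that estimates $\E[w^\top P w]$ by Monte Carlo and compares it against $\sigma^2 \mathrm{Tr}(P)$:

```python
import numpy as np

rng = np.random.default_rng(0)
d, sigma = 4, 0.5
M = rng.normal(size=(d, d))
P = M @ M.T + d * np.eye(d)                  # an arbitrary symmetric p.d. matrix

w = sigma * rng.normal(size=(200_000, d))    # samples of w ~ N(0, sigma^2 I)
mc_estimate = np.einsum("ni,ij,nj->n", w, P, w).mean()

print(mc_estimate, sigma**2 * np.trace(P))   # the two values should nearly agree
```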
Substituting this back into the expression for $Q^\star_\hi$, we have:

$$
\begin{aligned}
    Q^\star_\hi(\st, \act) & = \st^\top (Q + A^\top P_{\hi+1} A) \st + \act^\top (R + B^\top P_{\hi+1} B) \act + 2\st^\top A^\top P_{\hi+1} B \act \\
    & \qquad + \sigma^2 \mathrm{Tr}(P_{\hi+1}) + p_{\hi+1}.
\end{aligned}
$$

As we hoped, this expression is quadratic in $\st$ and $\act$. Furthermore, we'd like to show that it also *curves upwards* with respect to $\act$ so that its minimum with respect to $\act$ is well-defined. We can do this by noting that the **Hessian matrix** of second derivatives is positive definite:

$$
\nabla_{\act \act} Q_\hi^\star(\st, \act) = R + B^\top P_{\hi+1} B.
$$

Since $R$ is s.p.d. (by the LQR definition, Definition 2.2) and $P_{\hi+1}$ is s.p.d. (by the inductive hypothesis), this sum must also be s.p.d., and so $Q^\star_\hi$ is indeed an upward-curved quadratic with respect to $\act$. (If this isn't clear, try proving it as an exercise.) The proof of its upward curvature with respect to $\st$ is analogous.

**Lemma 2.2 ($\pi^\star_\hi$ is linear).**
Since $Q^\star_\hi$ is an upward-curved quadratic, finding its minimum over $\act$ is easy: we simply set the gradient with respect to $\act$ equal to zero and solve for $\act$. First, we calculate the gradient:

$$
\begin{aligned}
    \nabla_\act Q^\star_\hi(\st, \act) & = \nabla_\act \left[ \act^\top (R + B^\top P_{\hi+1} B) \act + 2 \st^\top A^\top P_{\hi+1} B \act \right] \\
    & = 2 (R + B^\top P_{\hi+1} B) \act + 2 (\st^\top A^\top P_{\hi+1} B)^\top.
\end{aligned}
$$
Setting this to zero, we get

$$
\begin{aligned}
    0 & = (R + B^\top P_{\hi+1} B) \pi^\star_\hi(\st) + B^\top P_{\hi+1} A \st \\
    \pi^\star_\hi(\st) & = (R + B^\top P_{\hi+1} B)^{-1} (-B^\top P_{\hi+1} A \st) \\
    & = - K_\hi \st,
\end{aligned}
$$

where

$$
K_\hi = (R + B^\top P_{\hi+1} B)^{-1} B^\top P_{\hi+1} A. \tag{2.23}
$$

Note that this optimal policy doesn't depend on the starting distribution $\mu_0$. It's also fully **deterministic** and isn't affected by the noise terms $w_0, \dots, w_{\hor-1}$.
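To make the lemma concrete, here is a small numerical sketch (our own illustration, assuming `numpy`; `A`, `B`, `R`, and `P_next`, which stands in for $P_{\hi+1}$, are all made-up stand-ins) that computes $K_\hi$ via (2.23) and checks that perturbing the action away from $-K_\hi \st$ only increases the action-dependent part of $Q^\star_\hi$:

```python
import numpy as np

rng = np.random.default_rng(1)
n_s, n_u = 3, 2
A = rng.normal(size=(n_s, n_s))
B = rng.normal(size=(n_s, n_u))
R = np.eye(n_u)                      # s.p.d. action cost
P_next = 2.0 * np.eye(n_s)           # stands in for the s.p.d. P_{h+1}

# K_h = (R + B^T P_{h+1} B)^{-1} B^T P_{h+1} A, computed via a linear solve
K = np.linalg.solve(R + B.T @ P_next @ B, B.T @ P_next @ A)

def q_in_u(s, u):
    """The u-dependent terms of Q*_h(s, u) from the quadratic expansion."""
    return u @ (R + B.T @ P_next @ B) @ u + 2 * s @ A.T @ P_next @ B @ u

s = rng.normal(size=n_s)
u_star = -K @ s
for _ in range(5):
    u = u_star + 0.1 * rng.normal(size=n_u)
    assert q_in_u(s, u) > q_in_u(s, u_star)   # any perturbation increases Q
```

Note the use of `np.linalg.solve` rather than explicitly forming the matrix inverse; this is the standard, cheaper, and more numerically stable choice.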
","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"Zr2MvPXpfe"},{"type":"inlineMath","value":"K_\\hi","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"html":"KhK_\\hiKh","key":"ssHl1s61mi"},{"type":"text","value":" from ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"YIV8fTeGSC"},{"type":"crossReference","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"(","key":"Q5KBRqGN3T"},{"type":"text","value":"2.23","key":"Nka5P0VBV9"},{"type":"text","value":")","key":"LWKvstC69o"}],"identifier":"k_pi","label":"k_pi","kind":"equation","template":"(%s)","enumerator":"2.23","resolved":true,"html_id":"k-pi","key":"kocTKoZZXs"},{"type":"text","value":".\nNotice that when we do this,\nthe ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"An5icZ2MVz"},{"type":"inlineMath","value":"(R+B^\\top P_{\\hi+1} B)","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"html":"(R+BPh+1B)(R+B^\\top P_{\\hi+1} B)(R+BPh+1B)","key":"wlekvUdrxT"},{"type":"text","value":" term in the expression is cancelled out by its inverse,\nand the remaining terms combine to give the ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"eRSN4RmiOl"},{"type":"strong","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"Riccati equation","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"Ph9tidrwzg"}],"key":"DBLzFwSYwI"},{"type":"text","value":":","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"zhhsAnf7dE"}],"key":"AoSgxHSDJL"},{"type":"proof","kind":"definition","label":"riccati","identifier":"riccati","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Riccati equation","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"srUnWG4i5B"}],"key":"MkgnwWnwcA"},{"type":"math","value":"P_\\hi = Q + A^\\top P_{\\hi+1} A - A^\\top P_{\\hi+1} B (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.","position":{"start":{"line":529,"column":1},"end":{"line":531,"column":1}},"html":"Ph=Q+APh+1AAPh+1B(R+BPh+1B)1BPh+1A.P_\\hi = Q + A^\\top P_{\\hi+1} A - A^\\top P_{\\hi+1} B (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.Ph=Q+APh+1AAPh+1B(R+BPh+1B)1BPh+1A.","enumerator":"2.26","key":"tWtmgggnaC"}],"enumerator":"2.5","html_id":"riccati","key":"CV8MRy83Ak"},{"type":"paragraph","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"There are several nice properties to note about the Riccati equation:","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"EHpSbHOXb8"}],"key":"CWvFnA5v1T"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":536,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":536,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"It’s defined 
","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"Z9pD25a5JF"},{"type":"strong","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"children":[{"type":"text","value":"recursively.","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"erRZr8f0Q1"}],"key":"a8tybTjmFI"},{"type":"text","value":"\nGiven the dynamics defined by ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"cHzxdVMm3L"},{"type":"inlineMath","value":"A","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"AAA","key":"eXyZiqQ25Z"},{"type":"text","value":" and ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"SJCmohsmGw"},{"type":"inlineMath","value":"B","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"BBB","key":"gYv6yiDwzN"},{"type":"text","value":", and the state cost matrix ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"TEo9rtq1Eg"},{"type":"inlineMath","value":"Q","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"QQQ","key":"saXXvroitj"},{"type":"text","value":",\nwe can recursively calculate ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"Q5Sw3cmuLS"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"PhP_\\hiPh","key":"qqxR1Pkmrm"},{"type":"text","value":" across all timesteps starting from ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"qBHtjFEGmV"},{"type":"inlineMath","value":"P_\\hor = Q","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"PH=QP_\\hor = QPH=Q","key":"bmsjSBxsEG"},{"type":"text","value":".","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"yRL45pbhcM"}],"key":"fuwNAyiUM4"},{"type":"listItem","spread":true,"position":{"start":{"line":539,"column":1},"end":{"line":540,"column":1}},"children":[{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"html":"PhP_\\hiPh","key":"Zml7szpXeu"},{"type":"text","value":" often appears in calculations surrounding optimality,\nsuch as ","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"key":"tNRZnlGpoX"},{"type":"inlineMath","value":"V^\\star_\\hi, Q^\\star_\\hi","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"html":"Vh,QhV^\\star_\\hi, Q^\\star_\\hiVh,Qh","key":"qEXAk1FY8C"},{"type":"text","value":", and ","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"key":"WVd8btYXeM"},{"type":"inlineMath","value":"\\pi^\\star_\\hi","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"html":"πh\\pi^\\star_\\hiπh","key":"DfLBCvo2t0"},{"type":"text","value":".","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"key":"MwCa2vr8HI"}],"key":"EbOc10gwez"},{"type":"listItem","spread":true,"position":{"start":{"line":541,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"text","value":"Together with the dynamics given by ","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"F2OyVQQsr2"},{"type":"inlineMath","value":"A","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"AAA","key":"J8ggQc4fEc"},{"type":"text","value":" and 
","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"MICvRTzZRb"},{"type":"inlineMath","value":"B","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"BBB","key":"Yswa9Gs0yg"},{"type":"text","value":",\nand the action coefficients ","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"HBSugnAWxx"},{"type":"inlineMath","value":"R","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"RRR","key":"EQfgb4Loxx"},{"type":"text","value":" in the lost function,\nit fully defines the optimal policy ","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"CWL5MDOd9d"},{"type":"crossReference","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"children":[{"type":"text","value":"Lemma ","key":"l6CwpWZ1Q4"},{"type":"text","value":"2.2","key":"bXtUeRYvsD"}],"identifier":"lemma_pi_linear","label":"lemma_pi_linear","kind":"proof:lemma","template":"Lemma %s","enumerator":"2.2","resolved":true,"html_id":"lemma-pi-linear","key":"S6r2Zwp9Bb"},{"type":"text","value":".","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"XhVYKFiIku"}],"key":"rLy5w9OVX5"}],"key":"Vp6cHmmyU4"},{"type":"paragraph","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"children":[{"type":"text","value":"It remains to prove that ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"TK4ccmbxG4"},{"type":"inlineMath","value":"V^\\star_\\hi","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"VhV^\\star_\\hiVh","key":"jyLtvTjHHB"},{"type":"text","value":" ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"VzRM9narss"},{"type":"emphasis","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"children":[{"type":"text","value":"curves upwards,","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"mWu1EpQTdQ"}],"key":"WmFMde9fz0"},{"type":"text","value":" that is, that ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"wSJyVDTHSc"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"PhP_\\hiPh","key":"rKV0LufAMJ"},{"type":"text","value":" is s.p.d. 
It remains to prove that $V^\star_\hi$ *curves upwards*, that is, that $P_\hi$ is s.p.d. We will use the following fact about **Schur complements**:

**Lemma 2.4 (Positive definiteness of Schur complements).**
Let

$$
D = \begin{pmatrix}
A & B \\
B^\top & C
\end{pmatrix}
$$

be a symmetric $(m+n) \times (m+n)$ block matrix, where $A \in \R^{m \times m}, B \in \R^{m \times n}, C \in \R^{n \times n}$. The **Schur complement** of $A$ is denoted

$$
D/A = C - B^\top A^{-1} B.
$$

Schur complements have various uses in linear algebra and numerical computation.

A useful fact for us is that if $A$ is positive *definite*, then $D$ is positive *semidefinite* if and only if $D/A$ is positive *semidefinite*.
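This fact can be probed numerically. The sketch below (our own check, assuming `numpy`; the block sizes and matrices are arbitrary) builds a $D$ whose upper-left block is p.d. and whose Schur complement is p.d., and confirms that the smallest eigenvalues of both $D$ and $D/A$ come out nonnegative together:

```python
import numpy as np

rng = np.random.default_rng(2)
m, n = 3, 4
A = np.eye(m) + 0.1 * np.ones((m, m))               # positive definite block
B = rng.normal(size=(m, n))
C = B.T @ np.linalg.solve(A, B) + 0.5 * np.eye(n)   # chosen so that D/A is p.d.

D = np.block([[A, B], [B.T, C]])
schur = C - B.T @ np.linalg.solve(A, B)

# Per the lemma, both minimum eigenvalues should be nonnegative (here positive).
print(np.linalg.eigvalsh(D).min(), np.linalg.eigvalsh(schur).min())
```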
that","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"ouE6yRURG3"}],"key":"tAN8K7gV1m"},{"type":"math","value":"S = P - P B (R + B^\\top P B)^{-1} B^\\top P","position":{"start":{"line":579,"column":1},"end":{"line":581,"column":1}},"html":"S=PPB(R+BPB)1BPS = P - P B (R + B^\\top P B)^{-1} B^\\top PS=PPB(R+BPB)1BP","enumerator":"2.29","key":"SLTtTvJe0o"},{"type":"paragraph","position":{"start":{"line":583,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"is p.s.d. (positive semidefinite),\nsince left- and right- multiplying by ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"FndU7FlbzE"},{"type":"inlineMath","value":"A^\\top","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"AA^\\topA","key":"Jn5ZkRV343"},{"type":"text","value":" and ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"MwDPwGyMvM"},{"type":"inlineMath","value":"A","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"AAA","key":"dmeQKmeZM8"},{"type":"text","value":" respectively\npreserves p.s.d.\nWe note that ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"Ej2QADo7R4"},{"type":"inlineMath","value":"S","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"SSS","key":"OpjN7dUnIy"},{"type":"text","value":" is the Schur complement ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"BOhgzVRrjq"},{"type":"inlineMath","value":"D/(R + B^\\top P B)","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"D/(R+BPB)D/(R + B^\\top P B)D/(R+BPB)","key":"SgwAdoCFL7"},{"type":"text","value":", where","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"N0wc0rdgRt"}],"key":"IxsDVLBeN4"},{"type":"math","value":"D = \\begin{pmatrix}\nR + B^\\top P B & B^\\top P \\\\\nP B & P\n\\end{pmatrix}.","position":{"start":{"line":588,"column":1},"end":{"line":593,"column":1}},"html":"D=(R+BPBBPPBP).D = \\begin{pmatrix}\nR + B^\\top P B & B^\\top P \\\\\nP B & P\n\\end{pmatrix}.D=(R+BPBPBBPP).","enumerator":"2.30","key":"wUaHPXuQMc"},{"type":"paragraph","position":{"start":{"line":595,"column":1},"end":{"line":596,"column":1}},"children":[{"type":"text","value":"Thus we must show that ","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"lv0CwfQ4Jf"},{"type":"inlineMath","value":"D","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"html":"DDD","key":"cXeSrP1Cbh"},{"type":"text","value":" is p.s.d..\nThis can be seen by computing","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"SyPOaB4Wwf"}],"key":"EqMXLTi5ap"},{"type":"math","value":"\\begin{aligned}\n\\begin{pmatrix}\ny^\\top & z^\\top\n\\end{pmatrix}\nD\n\\begin{pmatrix}\ny \\\\ z\n\\end{pmatrix}\n&= y^\\top R y + y^\\top B^\\top P B y + 2 y^\\top B^\\top P z + z^\\top P z \\\\\n&= y^\\top R y + (By + z)^\\top P (By + z) \\\\\n&> 0.\n\\end{aligned}","position":{"start":{"line":598,"column":1},"end":{"line":611,"column":1}},"html":"(yz)D(yz)=yRy+yBPBy+2yBPz+zPz=yRy+(By+z)P(By+z)>0.\\begin{aligned}\n\\begin{pmatrix}\ny^\\top & z^\\top\n\\end{pmatrix}\nD\n\\begin{pmatrix}\ny \\\\ z\n\\end{pmatrix}\n&= y^\\top R y + y^\\top B^\\top P B y + 2 y^\\top B^\\top P z + z^\\top P z \\\\\n&= y^\\top R y + (By + z)^\\top P (By + z) \\\\\n&> 
0.\n\\end{aligned}(yz)D(yz)=yRy+yBPBy+2yBPz+zPz=yRy+(By+z)P(By+z)>0.","enumerator":"2.31","key":"Cy4yUBabn0"},{"type":"paragraph","position":{"start":{"line":613,"column":1},"end":{"line":615,"column":1}},"children":[{"type":"text","value":"Since ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"sxLDm8oyLx"},{"type":"inlineMath","value":"R + B^\\top P B","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"R+BPBR + B^\\top P BR+BPB","key":"aV9tbH3qJM"},{"type":"text","value":" is p.d. and ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"X9r5SW6z9w"},{"type":"inlineMath","value":"D","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"DDD","key":"x7I5HquozV"},{"type":"text","value":" is p.s.d.,\nthen ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"UO71FRzPxZ"},{"type":"inlineMath","value":"S = D / (R + B^\\top P B)","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"S=D/(R+BPB)S = D / (R + B^\\top P B)S=D/(R+BPB)","key":"xYKvGPcF6Y"},{"type":"text","value":" must be p.s.d.,\nand ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"Uue4Sg3hvA"},{"type":"inlineMath","value":"P_\\hi = Q + A^\\top S A","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"Ph=Q+ASAP_\\hi = Q + A^\\top S APh=Q+ASA","key":"LWDTcvbasS"},{"type":"text","value":" must be p.d.","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"euK8eKuGV1"}],"key":"to5A1v4bHw"}],"enumerator":"2.3","key":"VWzAp6ebWf"},{"type":"paragraph","position":{"start":{"line":618,"column":1},"end":{"line":620,"column":1}},"children":[{"type":"text","value":"Now we’ve shown that ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"QCVs1BofCJ"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hi","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"html":"Vh(x)=xPhx+phV^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hiVh(x)=xPhx+ph","key":"ExKvW7Hgme"},{"type":"text","value":",\nwhere ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"eK4tTE73EA"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"html":"PhP_\\hiPh","key":"ahL3tYvoyD"},{"type":"text","value":" is s.p.d.,\nproving the inductive hypothesis and completing the proof of ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"wB9SGmZd9x"},{"type":"crossReference","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"children":[{"type":"text","value":"Theorem ","key":"DJw5nBgTDn"},{"type":"text","value":"2.2","key":"X1KBQkN0aF"}],"identifier":"optimal_policy_lqr_linear","label":"optimal_policy_lqr_linear","kind":"proof:theorem","template":"Theorem %s","enumerator":"2.2","resolved":true,"html_id":"optimal-policy-lqr-linear","key":"N937QonH9Y"},{"type":"text","value":" and ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"o49RPxNjge"},{"type":"crossReference","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"children":[{"type":"text","value":"Theorem 
","key":"OnP1yJaREQ"},{"type":"text","value":"2.1","key":"zA8SHKtRtV"}],"identifier":"optimal_value_lqr_quadratic","label":"optimal_value_lqr_quadratic","kind":"proof:theorem","template":"Theorem %s","enumerator":"2.1","resolved":true,"html_id":"optimal-value-lqr-quadratic","key":"FbVFY1Bw9a"},{"type":"text","value":".","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"zVv69126hR"}],"key":"S70Q1fA3Im"},{"type":"paragraph","position":{"start":{"line":622,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"In summary, we just demonstrated that at each timestep ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"rbG5UtsJ0F"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"NeeLq4MnCD"},{"type":"text","value":",\nthe optimal value function ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"xgpJGJZIa1"},{"type":"inlineMath","value":"V^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"VhV^\\star_\\hiVh","key":"aiado6KSO9"},{"type":"text","value":"\nand optimal Q-function ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"XHyO7Zuk9I"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"DBHjNHEphL"},{"type":"text","value":" are both upward-curved quadratics\nand the optimal policy ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"uoP5pyZEOs"},{"type":"inlineMath","value":"\\pi^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"πh\\pi^\\star_\\hiπh","key":"eXQX86kAhk"},{"type":"text","value":" is linear.\nWe also showed that all of these quantities can be calculated\nusing a sequence of s.p.d. 
matrices ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"eWztRHFZci"},{"type":"inlineMath","value":"P_0, \\dots, P_H","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"P0,,PHP_0, \\dots, P_HP0,,PH","key":"Q4fsWYJA8u"},{"type":"text","value":"\nthat can be defined recursively using the Riccati equation ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"aMzXlyZ1W8"},{"type":"crossReference","kind":"proof:definition","identifier":"riccati","label":"riccati","children":[{"type":"text","value":"Definition ","key":"IaOSHxd5jo"},{"type":"text","value":"2.5","key":"HZ63q115o0"}],"template":"Definition %s","enumerator":"2.5","resolved":true,"html_id":"riccati","key":"MbsLDe0OPT"},{"type":"text","value":".","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"dOZLSiyTtK"}],"key":"cdmigeGYh8"},{"type":"paragraph","position":{"start":{"line":630,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"text","value":"Before we move on to some extensions of LQR, let’s consider how the\nstate at time ","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"key":"cA3ENCYfZh"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"html":"h\\hih","key":"HB6gwC6rNA"},{"type":"text","value":" behaves when we act according to this optimal\npolicy.","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"key":"yMRhXOBVr6"}],"key":"EzYCnS0QQu"},{"type":"heading","depth":3,"position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"children":[{"type":"text","value":"Expected state at time ","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"ojmpqcKHl0"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"html":"h\\hih","key":"BKWIhK8Vsm"}],"identifier":"expected-state-at-time-hi","label":"Expected state at time \\hi","html_id":"expected-state-at-time-hi","implicit":true,"enumerator":"2.4.1","key":"iPDFVUiis9"},{"type":"paragraph","position":{"start":{"line":636,"column":1},"end":{"line":639,"column":1}},"children":[{"type":"text","value":"How can we compute the expected state at time ","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"NdBzoUSVpX"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"html":"h\\hih","key":"B0cRdyK6pt"},{"type":"text","value":" when acting\naccording to the optimal policy? Let’s first express ","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"BbJgKNmI0m"},{"type":"inlineMath","value":"\\st_\\hi","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"html":"xh\\st_\\hixh","key":"Gr3ywp2F51"},{"type":"text","value":" in a\ncleaner way in terms of the history. 
Note that having linear dynamics\nmakes it easy to expand terms backwards in time:","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"iCthcokLL3"}],"key":"sbDCoaq4NL"},{"type":"math","value":"\\begin{aligned}\n \\st_\\hi & = A \\st_{\\hi-1} + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = A (A\\st_{\\hi-2} + B \\act_{\\hi-2} + w_{\\hi-2}) + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = \\cdots \\\\\n & = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i (B \\act_{\\hi-i-1} + w_{\\hi-i-1}).\n\\end{aligned}","position":{"start":{"line":641,"column":1},"end":{"line":648,"column":1}},"html":"xh=Axh1+Buh1+wh1=A(Axh2+Buh2+wh2)+Buh1+wh1==Ahx0+i=0h1Ai(Buhi1+whi1).\\begin{aligned}\n \\st_\\hi & = A \\st_{\\hi-1} + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = A (A\\st_{\\hi-2} + B \\act_{\\hi-2} + w_{\\hi-2}) + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = \\cdots \\\\\n & = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i (B \\act_{\\hi-i-1} + w_{\\hi-i-1}).\n\\end{aligned}xh=Axh1+Buh1+wh1=A(Axh2+Buh2+wh2)+Buh1+wh1==Ahx0+i=0h1Ai(Buhi1+whi1).","enumerator":"2.32","key":"NRKjvlnkSY"},{"type":"paragraph","position":{"start":{"line":652,"column":1},"end":{"line":655,"column":1}},"children":[{"type":"text","value":"Let’s consider the ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"Q9dRZfVAHj"},{"type":"emphasis","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"children":[{"type":"text","value":"average state","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"oHgVF47cSa"}],"key":"TKRvtQ7r0b"},{"type":"text","value":" at this time, given all the past\nstates and actions. Since we assume that ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"i0yJIw2xKg"},{"type":"inlineMath","value":"\\E [w_\\hi] = 0","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"E[wh]=0\\E [w_\\hi] = 0E[wh]=0","key":"sWkGlq94xf"},{"type":"text","value":" (this is the\nzero vector in ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"kJrxrAiXva"},{"type":"inlineMath","value":"d","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"ddd","key":"jFTn9TdINF"},{"type":"text","value":" dimensions), when we take an expectation, the ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"WlPfDr7q9h"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"whw_\\hiwh","key":"n0MNBBxnXt"},{"type":"text","value":"\nterm vanishes due to linearity, and so we’re left with","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"cWTUJDuOBG"}],"key":"yNOiQnFJ38"},{"type":"math","value":"\\E [\\st_\\hi \\mid \\st_{0:(\\hi-1)}, \\act_{0:(\\hi-1)}] = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i B \\act_{\\hi-i-1}.","position":{"start":{"line":658,"column":1},"end":{"line":661,"column":1}},"identifier":"expected_state","label":"expected_state","html_id":"expected-state","html":"E[xhx0:(h1),u0:(h1)]=Ahx0+i=0h1AiBuhi1.\\E [\\st_\\hi \\mid \\st_{0:(\\hi-1)}, \\act_{0:(\\hi-1)}] = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i B 
\\act_{\\hi-i-1}.E[xhx0:(h1),u0:(h1)]=Ahx0+i=0h1AiBuhi1.","enumerator":"2.33","key":"HGP749xiU6"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":664,"column":1},"end":{"line":664,"column":1}},"key":"wcjNyafQfZ"}],"key":"uJLv0PqI7q"},{"type":"paragraph","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"children":[{"type":"text","value":"Show that if we choose actions according to the optimal policy ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"Nhznn4QqK7"},{"type":"crossReference","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"children":[{"type":"text","value":"Lemma ","key":"yOSByIH5Tv"},{"type":"text","value":"2.2","key":"gkc8IQQ80D"}],"identifier":"lemma_pi_linear","label":"lemma_pi_linear","kind":"proof:lemma","template":"Lemma %s","enumerator":"2.2","resolved":true,"html_id":"lemma-pi-linear","key":"SCvQ5dQzaL"},{"type":"text","value":", ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"cN5RgwktN2"},{"type":"crossReference","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"children":[{"type":"text","value":"(","key":"UYG85FTIeE"},{"type":"text","value":"2.33","key":"toApKsW0Fx"},{"type":"text","value":")","key":"xsBeI8GYi8"}],"identifier":"expected_state","label":"expected_state","kind":"equation","template":"(%s)","enumerator":"2.33","resolved":true,"html_id":"expected-state","key":"bF0om2YfIn"},{"type":"text","value":" becomes","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"wWlP2TZYnF"}],"key":"q1ei9jXaHn"},{"type":"math","value":"\\E [\\st_\\hi \\mid \\st_0, \\act_i = \\pi^\\star_i(\\st_i)\\quad \\forall i \\le \\hi] = \\left( \\prod_{i=0}^{\\hi-1} (A - B K_i) \\right) \\st_0.","position":{"start":{"line":667,"column":1},"end":{"line":669,"column":1}},"html":"E[xhx0,ui=πi(xi)ih]=(i=0h1(ABKi))x0.\\E [\\st_\\hi \\mid \\st_0, \\act_i = \\pi^\\star_i(\\st_i)\\quad \\forall i \\le \\hi] = \\left( \\prod_{i=0}^{\\hi-1} (A - B K_i) \\right) \\st_0.E[xhx0,ui=πi(xi)ih]=(i=0h1(ABKi))x0.","enumerator":"2.34","key":"XQf3lq6wY3"}],"key":"Uag0MXGb4k"},{"type":"paragraph","position":{"start":{"line":672,"column":1},"end":{"line":679,"column":1}},"children":[{"type":"text","value":"This introduces the quantity ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"KP53Nwpud5"},{"type":"inlineMath","value":"A - B K_i","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"ABKiA - B K_iABKi","key":"o7R8r99ehI"},{"type":"text","value":", which shows up frequently in\ncontrol theory. For example, one important question is: will ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"iJDJmB4LfE"},{"type":"inlineMath","value":"\\st_\\hi","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"xh\\st_\\hixh","key":"rFCv90rSVe"},{"type":"text","value":"\nremain bounded, or will it go to infinity as time goes on? 
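This question is easy to probe numerically. Below is a small sketch (all matrices are arbitrary toy values, not from any example in this chapter): we run the Riccati recursion of Definition 2.5 backwards to obtain a gain $K$, check that each $P_\hi$ stays positive definite as proven above, and roll out $\st_{\hi+1} = (A - BK)\st_\hi$ with and without control.

```python
import numpy as np

# Toy system (arbitrary values): the drift A is unstable on its own.
A = np.array([[1.1, 0.1],
              [0.0, 1.1]])
B = np.array([[0.0],
              [0.1]])
Q, R, H = np.eye(2), np.eye(1), 100

P = Q                              # base case: P_H = Q
for _ in range(H):                 # Riccati recursion, backwards in time
    K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)
    P = Q + A.T @ P @ (A - B @ K)  # equivalent form of Definition 2.5
    assert np.all(np.linalg.eigvalsh((P + P.T) / 2) > 0)  # each P_h stays p.d.

for K_used, label in ((K, "optimal gain"), (np.zeros((1, 2)), "no control")):
    M = A - B @ K_used
    rho = np.abs(np.linalg.eigvals(M)).max()   # |lambda_max| of A - B K
    x = np.array([1.0, 1.0])
    for _ in range(50):
        x = M @ x
    print(f"{label}: |lambda_max| = {rho:.3f}, ||x_50|| = {np.linalg.norm(x):.2e}")
```

On this toy system, the computed gain pulls every eigenvalue of $A - BK$ inside the unit circle, so the rollout decays, while the uncontrolled state grows without bound.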
To answer\nthis, let’s imagine for simplicity that these ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"QUESsFT03H"},{"type":"inlineMath","value":"K_i","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"KiK_iKi","key":"K33FHBhtCI"},{"type":"text","value":"s are equal (call\nthis matrix ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"YZhP0Yx1Sn"},{"type":"inlineMath","value":"K","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"KKK","key":"M3vQ6UCnUm"},{"type":"text","value":"). Then the expression above becomes ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"XpORCYvXvp"},{"type":"inlineMath","value":"(A-BK)^\\hi \\st_0","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"(ABK)hx0(A-BK)^\\hi \\st_0(ABK)hx0","key":"UwocF0bNS5"},{"type":"text","value":".\nNow consider the maximum eigenvalue ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"KqDfHqdxbM"},{"type":"inlineMath","value":"\\lambda_{\\max}","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"λmax\\lambda_{\\max}λmax","key":"xCYU9gSuaO"},{"type":"text","value":" of ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"Gcq9GwDEhv"},{"type":"inlineMath","value":"A - BK","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"ABKA - BKABK","key":"d1rriX02D0"},{"type":"text","value":". If\n","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"do4bXa2pBG"},{"type":"inlineMath","value":"|\\lambda_{\\max}| > 1","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"λmax>1|\\lambda_{\\max}| > 1λmax>1","key":"xDOIDre105"},{"type":"text","value":", then there’s some nonzero initial state\n","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"IRo0W5D7yb"},{"type":"inlineMath","value":"\\bar \\st_0","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"xˉ0\\bar \\st_0xˉ0","key":"nXs4jIlQUp"},{"type":"text","value":", the corresponding eigenvector, for which","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"NHJF9zQIPg"}],"key":"pVwrrCQOsG"},{"type":"math","value":"\\lim_{\\hi \\to \\infty} (A - BK)^\\hi \\bar \\st_0\n = \\lim_{\\hi \\to \\infty} \\lambda_{\\max}^\\hi \\bar \\st_0\n = \\infty.","position":{"start":{"line":682,"column":1},"end":{"line":686,"column":1}},"html":"limh(ABK)hxˉ0=limhλmaxhxˉ0=.\\lim_{\\hi \\to \\infty} (A - BK)^\\hi \\bar \\st_0\n = \\lim_{\\hi \\to \\infty} \\lambda_{\\max}^\\hi \\bar \\st_0\n = \\infty.hlim(ABK)hxˉ0=hlimλmaxhxˉ0=∞.","enumerator":"2.35","key":"InJ3BLikoV"},{"type":"paragraph","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"children":[{"type":"text","value":"Otherwise, if ","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"key":"u7SBkLMYKN"},{"type":"inlineMath","value":"|\\lambda_{\\max}| < 1","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"html":"λmax<1|\\lambda_{\\max}| < 1λmax<1","key":"HtAHJPCotR"},{"type":"text","value":", then it’s impossible for your original state to explode as 
dramatically.","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"key":"GMcWucsikC"}],"key":"SLHBmB3xTR"},{"type":"heading","depth":2,"position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"children":[{"type":"text","value":"Extensions","position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"key":"qy1h5fEZbR"}],"identifier":"extensions","label":"Extensions","html_id":"extensions","implicit":true,"enumerator":"2.5","key":"bjxQSGt7h2"},{"type":"paragraph","position":{"start":{"line":692,"column":1},"end":{"line":697,"column":1}},"children":[{"type":"text","value":"We’ve now formulated an optimal solution for the time-homogeneous LQR\nand computed the expected state under the optimal policy. However, real\nworld tasks rarely have such simple dynamics, and we may wish to design\nmore complex cost functions. In this section, we’ll consider more\ngeneral extensions of LQR where some of the assumptions we made above\nare relaxed. Specifically, we’ll consider:","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"tznG8K6lui"}],"key":"vyZjX55Fbk"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":699,"column":1},"end":{"line":707,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":699,"column":1},"end":{"line":701,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":699,"column":1},"end":{"line":700,"column":1}},"children":[{"type":"strong","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"children":[{"type":"text","value":"Time-dependency","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"key":"DLMkCR2WR6"}],"key":"FtVd5XCQHZ"},{"type":"text","value":", where the dynamics and cost function might\nchange depending on the timestep.","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"key":"Js3PQMi3mU"}],"key":"s7Oe7bBqdg"}],"key":"P4m8GR0jkt"},{"type":"listItem","spread":true,"position":{"start":{"line":702,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":702,"column":1},"end":{"line":703,"column":1}},"children":[{"type":"strong","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"children":[{"type":"text","value":"General quadratic cost","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"JYuJcDYKWC"}],"key":"ipVhfYKfEw"},{"type":"text","value":", where we allow for linear terms and a\nconstant term.","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"q0CGTmthtB"}],"key":"eron0AvCso"}],"key":"xf8t9vYqE5"},{"type":"listItem","spread":true,"position":{"start":{"line":705,"column":1},"end":{"line":707,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":705,"column":1},"end":{"line":706,"column":1}},"children":[{"type":"strong","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"Tracking a goal trajectory","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"owEKE4nxJL"}],"key":"FxSsvgDhvL"},{"type":"text","value":" rather than aiming for a single goal\nstate-action 
pair.","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"a7SBs1p5GM"}],"key":"eEtaTQEO7M"}],"key":"B4iNiNnYjL"}],"key":"Veuq64cm5c"},{"type":"paragraph","position":{"start":{"line":708,"column":1},"end":{"line":710,"column":1}},"children":[{"type":"text","value":"Combining these will allow us to use the LQR solution to solve more\ncomplex setups by taking ","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"VrclT0p5G6"},{"type":"emphasis","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"children":[{"type":"text","value":"Taylor approximations","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"V7doWNEiJR"}],"key":"w4GzTup3DM"},{"type":"text","value":" of the dynamics and\ncost functions.","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"de3SjtdbWE"}],"key":"D7vZlRCs5S"},{"type":"heading","depth":3,"position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"children":[{"type":"text","value":"Time-dependent dynamics and cost function","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"KAl2Ao0o4j"}],"label":"time_dep_lqr","identifier":"time_dep_lqr","html_id":"time-dep-lqr","enumerator":"2.5.1","key":"RSOcgVAI4W"},{"type":"paragraph","position":{"start":{"line":715,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"So far, we’ve considered the ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"fmU26305DR"},{"type":"emphasis","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"children":[{"type":"text","value":"time-homogeneous","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"a6TYJovEkN"}],"key":"fD3staecwO"},{"type":"text","value":" case, where the dynamics\nand cost function stay the same at every timestep. However, this might\nnot always be the case. As an example, in many sports, the rules and\nscoring system might change during an overtime period. 
To address these\nsorts of problems, we can loosen the time-homogeneous restriction, and\nconsider the case where the dynamics and cost function are\n","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"C7rhyl7k10"},{"type":"emphasis","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"children":[{"type":"text","value":"time-dependent.","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"iYQJjCLC3b"}],"key":"LpoYPujc8L"},{"type":"text","value":" Our analysis remains almost identical; in fact, we can\nsimply add a time index to the matrices ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"FE1NPI3Xk4"},{"type":"inlineMath","value":"A","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"AAA","key":"Dg6GdZqT5d"},{"type":"text","value":" and ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"UXfS4hV6HH"},{"type":"inlineMath","value":"B","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"BBB","key":"O5rysbgrLB"},{"type":"text","value":" that determine the\ndynamics and the matrices ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"UeQw5pwpej"},{"type":"inlineMath","value":"Q","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"QQQ","key":"SBbg47pY4R"},{"type":"text","value":" and ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"ZSeXltFPZw"},{"type":"inlineMath","value":"R","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"RRR","key":"ZTtqAf4w1E"},{"type":"text","value":" that determine the cost.","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"of9JURcI4L"}],"key":"uahUPXVi3O"},{"type":"paragraph","position":{"start":{"line":725,"column":1},"end":{"line":725,"column":1}},"children":[{"type":"text","value":"The modified problem is now defined as follows:","position":{"start":{"line":725,"column":1},"end":{"line":725,"column":1}},"key":"xnH2jo7vXV"}],"key":"p4aTeKDYex"},{"type":"proof","kind":"definition","label":"time_dependent_lqr","identifier":"time_dependent_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Time-dependent LQR","position":{"start":{"line":727,"column":1},"end":{"line":727,"column":1}},"key":"LWaJKXXR8c"}],"key":"Sbbu1TKbn4"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_{0}, \\dots, \\pi_{\\hor-1}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} (\\st_\\hi^\\top Q_\\hi \\st_\\hi) + \\act_\\hi^\\top R_\\hi \\act_\\hi \\right) + \\st_\\hor^\\top Q_\\hor \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + w_\\hi \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}","position":{"start":{"line":730,"column":1},"end":{"line":738,"column":1}},"html":"minπ0,,πH1E[(h=0H1(xhQhxh)+uhRhuh)+xHQHxH]wherexh+1=fh(xh,uh,wh)=Ahxh+Bhuh+whx0μ0uh=πh(xh)whN(0,σ2I).\\begin{aligned}\n \\min_{\\pi_{0}, \\dots, \\pi_{\\hor-1}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} (\\st_\\hi^\\top Q_\\hi \\st_\\hi) + \\act_\\hi^\\top R_\\hi \\act_\\hi \\right) + \\st_\\hor^\\top Q_\\hor \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, 
w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + w_\\hi \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}π0,,πH1minwhereE[(h=0H1(xhQhxh)+uhRhuh)+xHQHxH]xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+whx0μ0uh=πh(xh)whN(0,σ2I).","enumerator":"2.36","key":"TBWuDwYjul"}],"enumerator":"2.6","html_id":"time-dependent-lqr","key":"s1uhH9f08c"},{"type":"paragraph","position":{"start":{"line":743,"column":1},"end":{"line":745,"column":1}},"children":[{"type":"text","value":"The derivation of the optimal value functions and the optimal policy\nremains almost exactly the same, and we can modify the Riccati equation\naccordingly:","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"voR6YiX2gb"}],"key":"p6809FqERl"},{"type":"proof","kind":"definition","label":"riccati_time_dependent","identifier":"riccati_time_dependent","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Time-dependent Riccati Equation","position":{"start":{"line":747,"column":1},"end":{"line":747,"column":1}},"key":"YD6IwVvtis"}],"key":"kfq64bXbU3"},{"type":"math","value":"P_\\hi = Q_\\hi + A_\\hi^\\top P_{\\hi+1} A_\\hi - A_\\hi^\\top P_{\\hi+1} B_\\hi (R_\\hi + B_\\hi^\\top P_{\\hi+1} B_\\hi)^{-1} B_\\hi^\\top P_{\\hi+1} A_\\hi.","position":{"start":{"line":750,"column":1},"end":{"line":752,"column":1}},"html":"Ph=Qh+AhPh+1AhAhPh+1Bh(Rh+BhPh+1Bh)1BhPh+1Ah.P_\\hi = Q_\\hi + A_\\hi^\\top P_{\\hi+1} A_\\hi - A_\\hi^\\top P_{\\hi+1} B_\\hi (R_\\hi + B_\\hi^\\top P_{\\hi+1} B_\\hi)^{-1} B_\\hi^\\top P_{\\hi+1} A_\\hi.Ph=Qh+AhPh+1AhAhPh+1Bh(Rh+BhPh+1Bh)1BhPh+1Ah.","enumerator":"2.37","key":"PH08t8lDr0"},{"type":"paragraph","position":{"start":{"line":754,"column":1},"end":{"line":756,"column":1}},"children":[{"type":"text","value":"Note that this is just the time-homogeneous Riccati equation\n(","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"rNdbJbstDz"},{"type":"crossReference","kind":"proof:definition","identifier":"riccati","label":"riccati","children":[{"type":"text","value":"Definition ","key":"yRLKA4Xhj6"},{"type":"text","value":"2.5","key":"LIZ87M0JcG"}],"template":"Definition %s","enumerator":"2.5","resolved":true,"html_id":"riccati","key":"Qk27aKh0st"},{"type":"text","value":"), but with the time index added to each of the\nrelevant matrices.","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"hmVq8vZDeB"}],"key":"RiMSGP4LZG"}],"enumerator":"2.7","html_id":"riccati-time-dependent","key":"kPNn3UZENO"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":759,"column":1},"end":{"line":759,"column":1}},"key":"BnXx2MSJUD"}],"key":"iutMnppDpB"},{"type":"paragraph","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Walk through the proof in ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"O8gTVcBy5X"},{"type":"crossReference","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Section ","key":"mMoTSHucbR"},{"type":"text","value":"2.4","key":"ObmWuuRDO7"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"yo319tHFFP"},{"type":"text","value":" to verify that we can simply add 
","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"huA4DF0ZkI"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"html":"h\\hih","key":"zucNQezTRc"},{"type":"text","value":" for the time-dependent case.","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"zO87Sc8Xio"}],"key":"eHSJTEU0IG"}],"key":"e6AiWxjaFE"},{"type":"paragraph","position":{"start":{"line":763,"column":1},"end":{"line":765,"column":1}},"children":[{"type":"text","value":"Additionally, by allowing the dynamics to vary across time, we gain the\nability to ","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"Y1BhS7pvV2"},{"type":"emphasis","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"children":[{"type":"text","value":"locally approximate","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"hrtbB5trFX"}],"key":"PDL4IQ5cr1"},{"type":"text","value":" nonlinear dynamics at each timestep.\nWe’ll discuss this later in the chapter.","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"L8ySDFUS5X"}],"key":"V8QkRkuTJs"},{"type":"heading","depth":3,"position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"text","value":"More general quadratic cost functions","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"key":"nTcT7nLqH7"}],"identifier":"more-general-quadratic-cost-functions","label":"More general quadratic cost functions","html_id":"more-general-quadratic-cost-functions","implicit":true,"enumerator":"2.5.2","key":"MUn8eJNVS8"},{"type":"paragraph","position":{"start":{"line":769,"column":1},"end":{"line":776,"column":1}},"children":[{"type":"text","value":"Our original cost function had only second-order terms with respect to\nthe state and action, incentivizing staying as close as possible to\n","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"lbmAsZKJBc"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star) = (0, 0)","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"(x,u)=(0,0)(\\st^\\star, \\act^\\star) = (0, 0)(x,u)=(0,0)","key":"em5OwOptIM"},{"type":"text","value":". We can also consider more general\nquadratic cost functions that also have first-order terms and a constant\nterm. 
Combining this with time-dependent dynamics results in the\nfollowing expression, where we introduce a new matrix ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"Md2KJtIB4N"},{"type":"inlineMath","value":"M_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"MhM_\\hiMh","key":"HOgimH6PvN"},{"type":"text","value":" for the\ncross term, linear coefficients ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"IYOjFR4CCI"},{"type":"inlineMath","value":"q_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"qhq_\\hiqh","key":"zEIcLyBimS"},{"type":"text","value":" and ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"wrkfBxmwky"},{"type":"inlineMath","value":"r_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"rhr_\\hirh","key":"X3OPWi9sGr"},{"type":"text","value":" for the state and\naction respectively, and a constant term ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"bWOTTcnH46"},{"type":"inlineMath","value":"c_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"chc_\\hich","key":"LeKk0Q9cuJ"},{"type":"text","value":":","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"oPe8qpZ472"}],"key":"Ib97BTI1Oj"},{"type":"math","value":"c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.","label":"general_quadratic_cost","identifier":"general_quadratic_cost","html":"ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.","enumerator":"2.38","html_id":"general-quadratic-cost","key":"ps1jFwnFGO"},{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"Similarly, we can also include a\nconstant term ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"WwnbOG6ycK"},{"type":"inlineMath","value":"v_\\hi \\in \\mathbb{R}^{n_\\st}","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"vhRnxv_\\hi \\in \\mathbb{R}^{n_\\st}vhRnx","key":"y1AncqSnjx"},{"type":"text","value":" in the dynamics (note that this is\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"ZCOpqsfoEA"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"Oc03JWzAIs"}],"key":"xagPQuMYqy"},{"type":"text","value":" at each timestep, unlike the stochastic noise ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"ioLtpLzvzQ"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"whw_\\hiwh","key":"TRksXyalOH"},{"type":"text","value":"):","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"hX7i8q8Uly"}],"key":"FffksEFhfj"},{"type":"math","value":"\\st_{\\hi+1} = f_\\hi(\\st_\\hi, 
\\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.","position":{"start":{"line":789,"column":1},"end":{"line":791,"column":1}},"html":"xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.","enumerator":"2.39","key":"XgiZ2Vy5K5"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"exercise","position":{"start":{"line":795,"column":1},"end":{"line":795,"column":1}},"key":"eiI1702qqG"}],"key":"pSsCkj9xLU"},{"type":"paragraph","position":{"start":{"line":796,"column":1},"end":{"line":797,"column":1}},"children":[{"type":"text","value":"Derive the optimal solution. You will need to slightly modify the\nproof in ","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"uaI7FToSpl"},{"type":"crossReference","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"text","value":"Section ","key":"G5Vmjy14YE"},{"type":"text","value":"2.4","key":"zMdkXvaTaV"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"facWXBUnp9"},{"type":"text","value":".","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"o1aUPGfhL4"}],"key":"OLjX2aW6jF"}],"key":"lmJpbm3j3t"},{"type":"heading","depth":3,"position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"text","value":"Tracking a predefined trajectory","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"PFDSB5Lyaa"}],"identifier":"tracking-a-predefined-trajectory","label":"Tracking a predefined trajectory","html_id":"tracking-a-predefined-trajectory","implicit":true,"enumerator":"2.5.3","key":"qD0L57YEd6"},{"type":"paragraph","position":{"start":{"line":802,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Consider applying LQR to a task like autonomous driving, where the\ntarget state-action pair changes over time. We might want the vehicle to\nfollow a predefined ","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"tEeObmr7Xc"},{"type":"emphasis","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"qSZAuQcHPS"}],"key":"WMo4DCXLvO"},{"type":"text","value":" of states and actions\n","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"GH93nmDerE"},{"type":"inlineMath","value":"(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"html":"(xh,uh)h=0H1(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}(xh,uh)h=0H1","key":"uz1YfYnI9n"},{"type":"text","value":". 
To express this as a\ncontrol problem, we’ll need a corresponding time-dependent cost\nfunction:","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"f6DVcDFCG8"}],"key":"cnOC9EEEhl"},{"type":"math","value":"c_\\hi(\\st_\\hi, \\act_\\hi) = (\\st_\\hi - \\st^\\star_\\hi)^\\top Q (\\st_\\hi - \\st^\\star_\\hi) + (\\act_\\hi - \\act^\\star_\\hi)^\\top R (\\act_\\hi - \\act^\\star_\\hi).","position":{"start":{"line":810,"column":1},"end":{"line":812,"column":1}},"html":"ch(xh,uh)=(xhxh)Q(xhxh)+(uhuh)R(uhuh).c_\\hi(\\st_\\hi, \\act_\\hi) = (\\st_\\hi - \\st^\\star_\\hi)^\\top Q (\\st_\\hi - \\st^\\star_\\hi) + (\\act_\\hi - \\act^\\star_\\hi)^\\top R (\\act_\\hi - \\act^\\star_\\hi).ch(xh,uh)=(xhxh)Q(xhxh)+(uhuh)R(uhuh).","enumerator":"2.40","key":"ozq7tFdbWc"},{"type":"paragraph","position":{"start":{"line":815,"column":1},"end":{"line":818,"column":1}},"children":[{"type":"text","value":"Note that this punishes states and actions that are far from the\nintended trajectory. By expanding out these multiplications, we can see\nthat this is actually a special case of the more general quadratic cost\nfunction above ","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"Dbol9SEfUZ"},{"type":"crossReference","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"children":[{"type":"text","value":"(","key":"Gg2D5MW5gL"},{"type":"text","value":"2.38","key":"fA89AVphbV"},{"type":"text","value":")","key":"u8JkHpxWPc"}],"identifier":"general_quadratic_cost","label":"general_quadratic_cost","kind":"equation","template":"(%s)","enumerator":"2.38","resolved":true,"html_id":"general-quadratic-cost","key":"wbgAG1aRnc"},{"type":"text","value":":","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"ZcDg2GMXov"}],"key":"ScmI6PmzhI"},{"type":"math","value":"M_\\hi = 0, \\qquad q_\\hi = -2Q \\st^\\star_\\hi, \\qquad r_\\hi = -2R \\act^\\star_\\hi, \\qquad c_\\hi = (\\st^\\star_\\hi)^\\top Q (\\st^\\star_\\hi) + (\\act^\\star_\\hi)^\\top R (\\act^\\star_\\hi).","position":{"start":{"line":821,"column":1},"end":{"line":823,"column":1}},"html":"Mh=0,qh=2Qxh,rh=2Ruh,ch=(xh)Q(xh)+(uh)R(uh).M_\\hi = 0, \\qquad q_\\hi = -2Q \\st^\\star_\\hi, \\qquad r_\\hi = -2R \\act^\\star_\\hi, \\qquad c_\\hi = (\\st^\\star_\\hi)^\\top Q (\\st^\\star_\\hi) + (\\act^\\star_\\hi)^\\top R (\\act^\\star_\\hi).Mh=0,qh=2Qxh,rh=2Ruh,ch=(xh)Q(xh)+(uh)R(uh).","enumerator":"2.41","key":"iRCGx4fteu"},{"type":"heading","depth":2,"position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"Approximating nonlinear dynamics","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"xbPCLnBDrU"}],"label":"approx_nonlinear","identifier":"approx_nonlinear","html_id":"approx-nonlinear","enumerator":"2.6","key":"FW0tWsdv1v"},{"type":"paragraph","position":{"start":{"line":830,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"The LQR algorithm solves for the optimal policy when the dynamics are\n","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"WMV4w0ogwR"},{"type":"emphasis","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"rRJUqayNhH"}],"key":"gbLxC4CIFq"},{"type":"text","value":" and the cost function is an 
","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"puCL3IpkkP"},{"type":"emphasis","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"upward-curved quadratic","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"EX3NMgqnXa"}],"key":"Gesl8WhhWx"},{"type":"text","value":". However,\nreal settings are rarely this simple! Let’s return to the CartPole\nexample from the start of the chapter\n(","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"fdJ3dW3LYr"},{"type":"crossReference","kind":"proof:example","identifier":"cart_pole","label":"cart_pole","children":[{"type":"text","value":"Example ","key":"LlU6obsIIm"},{"type":"text","value":"2.1","key":"W30nlujzEY"}],"template":"Example %s","enumerator":"2.1","resolved":true,"html_id":"cart-pole","key":"eXtmfFXR9Z"},{"type":"text","value":"). The dynamics (physics) aren’t linear. How\ncan we approximate this by an LQR problem?","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"ZAIGV0JIlV"}],"key":"rDEJ0KCanB"},{"type":"paragraph","position":{"start":{"line":837,"column":1},"end":{"line":840,"column":1}},"children":[{"type":"text","value":"Concretely, let’s consider a ","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"K2XOR6Ax3m"},{"type":"emphasis","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"children":[{"type":"text","value":"noise-free","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"H90jun5Kn4"}],"key":"K29h11IQRF"},{"type":"text","value":" problem since, as we saw, the\nnoise doesn’t factor into the optimal policy. Let’s assume the dynamics\nand cost function are stationary, and ignore the terminal state for\nsimplicity:","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"Hh1zFBHfh3"}],"key":"U7j9np1mZH"},{"type":"proof","kind":"definition","label":"nonlinear_control","identifier":"nonlinear_control","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Nonlinear control problem","position":{"start":{"line":842,"column":1},"end":{"line":842,"column":1}},"key":"oQQlyA85Ey"}],"key":"mLFXwlXCeQ"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, \\act^\\star).\n\\end{aligned}","position":{"start":{"line":847,"column":1},"end":{"line":855,"column":1}},"html":"minπ0,,πH1:SAEx0[h=0H1c(xh,uh)]wherexh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, 
\\act^\\star).\n\\end{aligned}π0,,πH1:SAminwhereEx0[h=0H1c(xh,uh)]xh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).","enumerator":"2.42","key":"m3HoohJSFM"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":858,"column":1}},"children":[{"type":"text","value":"Here, ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"p9H8nqbvku"},{"type":"inlineMath","value":"d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"ddd","key":"vcaKKbtl29"},{"type":"text","value":" denotes a function that measures the\n“distance” between its two arguments.","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"IXDaK1CLaZ"}],"key":"cZKfH1UA3X"}],"enumerator":"2.8","html_id":"nonlinear-control","key":"GSfxhT07Ux"},{"type":"paragraph","position":{"start":{"line":861,"column":1},"end":{"line":871,"column":1}},"children":[{"type":"text","value":"This is now only slightly simplified from the general optimal control\nproblem (see\n","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"ipg3PXI9MY"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"TLEFPjunhD"},{"type":"text","value":"2.1","key":"WXrXhKbrlt"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"O6lWHoZ2wh"},{"type":"text","value":"). Here, we don’t know an analytical form\nfor the dynamics ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"pavZEZ85wJ"},{"type":"inlineMath","value":"f","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"fff","key":"kdEDCKT5YG"},{"type":"text","value":" or the cost function ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"Ri2V3PbEeA"},{"type":"inlineMath","value":"c","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"ccc","key":"aBawPl2KkB"},{"type":"text","value":", but we assume that we’re\nable to ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"ajDVALB7N4"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"query/sample/simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"eiPt20b1Df"}],"key":"fHtAx5AnCE"},{"type":"text","value":" them to get their values at a given\nstate and action. To clarify, consider the case where the dynamics are\ngiven by real world physics. We can’t (yet) write down an expression for\nthe dynamics that we can differentiate or integrate analytically.\nHowever, we can still ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"BPUuFZAGJV"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"S6FA20Z9KT"}],"key":"aYy2n2F7yq"},{"type":"text","value":" the dynamics and cost function by\nrunning a real-world experiment and measuring the resulting states and\ncosts. 
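In code, the query-only setting of Definition 2.8 might look as follows. The pendulum-style dynamics and squared-distance cost here are made-up stand-ins for a physics simulator or a real-world experiment; the point is that we only ever *call* `f` and `c`, never differentiate them analytically.

```python
import numpy as np

# Black-box dynamics: a made-up pendulum-like step function standing in
# for a simulator or physical system we can only query.
def f(s: np.ndarray, a: np.ndarray, dt: float = 0.05) -> np.ndarray:
    theta, omega = s
    return np.array([theta + dt * omega,
                     omega + dt * (-9.8 * np.sin(theta) + a[0])])

# Black-box cost: squared distance to the goal (s*, a*) = (0, 0).
def c(s: np.ndarray, a: np.ndarray) -> float:
    return float(s @ s + a @ a)

s, a = np.array([0.1, 0.0]), np.array([0.0])
print(f(s, a), c(s, a))    # we can sample values, nothing more
```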
How can we adapt LQR to this more general nonlinear case?","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"JNPGZw9qW7"}],"key":"wlXSHKkqWe"},{"type":"heading","depth":3,"position":{"start":{"line":873,"column":1},"end":{"line":873,"column":1}},"children":[{"type":"text","value":"Local linearization","position":{"start":{"line":873,"column":1},"end":{"line":873,"column":1}},"key":"Y9M8hsSPSp"}],"identifier":"local-linearization","label":"Local linearization","html_id":"local-linearization","implicit":true,"enumerator":"2.6.1","key":"ch38JYpBiJ"},{"type":"paragraph","position":{"start":{"line":875,"column":1},"end":{"line":883,"column":1}},"children":[{"type":"text","value":"How can we apply LQR when the dynamics are nonlinear or the cost\nfunction is more complex? We’ll exploit the useful fact that we can take\na function that’s ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"z7u4uico4r"},{"type":"emphasis","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"children":[{"type":"text","value":"locally smooth","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"zVivEqVB8U"}],"key":"dMvaMvWw8F"},{"type":"text","value":" around ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"ce3ZW8CaQZ"},{"type":"inlineMath","value":"(s^\\star, a^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(s,a)(s^\\star, a^\\star)(s,a)","key":"f1v6nHn3pS"},{"type":"text","value":" and\napproximate it nearby with low-order polynomials (i.e. its Taylor\napproximation). In particular, as long as the dynamics ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"lLhXQZP9Pl"},{"type":"inlineMath","value":"f","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"fff","key":"rZM1LsnD3F"},{"type":"text","value":" are\ndifferentiable around ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"LHmxvolMAa"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"MkS2l4Bnu9"},{"type":"text","value":" and the cost function\n","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"UbfmfgiSmQ"},{"type":"inlineMath","value":"c","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"ccc","key":"S6Mvx4zO5r"},{"type":"text","value":" is twice differentiable at ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"URmC9HGF24"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"U0erW7FIRx"},{"type":"text","value":", we can take a\nlinear approximation of ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"vfRymSDczl"},{"type":"inlineMath","value":"f","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"fff","key":"jWmVUvDNAW"},{"type":"text","value":" and a quadratic approximation of ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"UMhjHc3ff7"},{"type":"inlineMath","value":"c","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"ccc","key":"KrjnazVP5W"},{"type":"text","value":" 
to\nbring us back to the regime of LQR.","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"nWLVB4V7XZ"}],"key":"bdjSuPNDpL"},{"type":"paragraph","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"children":[{"type":"text","value":"Linearizing the dynamics around ","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"rg0vwATW6l"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"ZBtX2G1UXe"},{"type":"text","value":" gives:","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"ZH4SwmxePT"}],"key":"Lf23sHBbj2"},{"type":"math","value":"\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}","position":{"start":{"line":888,"column":1},"end":{"line":893,"column":1}},"html":"f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dfi(x,u)dxj,i,jnx(uf(x,u))ij=dfi(x,u)duj,inx,jnu\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dxjdfi(x,u),i,jnx(uf(x,u))ij=dujdfi(x,u),inx,jnu","enumerator":"2.43","key":"IifOpd2fhV"},{"type":"paragraph","position":{"start":{"line":895,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"and quadratizing the cost function around\n","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"HM3OOwoAcT"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"YRy7JsqPFI"},{"type":"text","value":" gives:","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"ii3vJUkUKV"}],"key":"DsRhTXzfS0"},{"type":"math","value":"\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. 
\\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":908,"column":1}},"html":"c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+12(xx)xxc(x,u)(xx)+12(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)}quadratic terms\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. \\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+21(xx)xxc(x,u)(xx)+21(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)quadratic terms","enumerator":"2.44","key":"u0t3bWPhew"},{"type":"paragraph","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"where the gradients and Hessians are defined as","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"key":"AuZ7kQ62aw"}],"key":"SAxPJluj2j"},{"type":"math","value":"\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. \\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}","position":{"start":{"line":913,"column":1},"end":{"line":921,"column":1}},"html":"(xc(x,u))i=dc(x,u)dxi,inx(uc(x,u))i=dc(x,u)dui,inu(xxc(x,u))ij=d2c(x,u)dxidxj,i,jnx(uuc(x,u))ij=d2c(x,u)duiduj,i,jnu(xuc(x,u))ij=d2c(x,u)dxiduj.inx,jnu\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. 
\\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}(xc(x,u))i(xxc(x,u))ij(xuc(x,u))ij=dxidc(x,u),inx=dxidxjd2c(x,u),i,jnx=dxidujd2c(x,u),inx,jnu(uc(x,u))i(uuc(x,u))ij=duidc(x,u),inu=duidujd2c(x,u),i,jnu","enumerator":"2.45","key":"ibKJDh5gst"},{"type":"paragraph","position":{"start":{"line":925,"column":1},"end":{"line":928,"column":1}},"children":[{"type":"strong","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"nh3ubW6cz0"}],"key":"PPBFEVillk"},{"type":"text","value":" Note that this cost can be expressed in the general\nquadratic form seen in\n","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"fmOdjnGXjx"},{"type":"crossReference","kind":"equation","identifier":"general_quadratic_cost","label":"general_quadratic_cost","children":[{"type":"text","value":"(","key":"feGJy4Azko"},{"type":"text","value":"2.38","key":"eltKX7jYm8"},{"type":"text","value":")","key":"YLw4hTPGNR"}],"template":"(%s)","enumerator":"2.38","resolved":true,"html_id":"general-quadratic-cost","key":"F4ZLz3GX1p"},{"type":"text","value":". Derive the corresponding\nquantities ","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"CVvMi2PNkc"},{"type":"inlineMath","value":"Q, R, M, q, r, c","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"html":"Q,R,M,q,r,cQ, R, M, q, r, cQ,R,M,q,r,c","key":"VEbG8yFt4Q"},{"type":"text","value":".","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"J372wpG1x2"}],"key":"VTk19zolU5"},{"type":"heading","depth":3,"position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"children":[{"type":"text","value":"Finite differencing","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"JZYMxuTQDH"}],"identifier":"finite-differencing","label":"Finite differencing","html_id":"finite-differencing","implicit":true,"enumerator":"2.6.2","key":"VzzWyveZgb"},{"type":"paragraph","position":{"start":{"line":932,"column":1},"end":{"line":936,"column":1}},"children":[{"type":"text","value":"To calculate these gradients and Hessians in practice,\nwe use a method known as ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"qN9JuLCO4a"},{"type":"strong","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"finite differencing","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"hM3v0fjT2T"}],"key":"tL7lDYyL0c"},{"type":"text","value":" for numerically computing derivatives.\nNamely, we can simply use the limit definition of the derivative, and\nsee how the function changes as we add or subtract a tiny ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"cVS5js6MWV"},{"type":"text","value":"δ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"BOlU7JfkRp"},{"type":"text","value":" to\nthe input.","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"jIoEfXsbuL"}],"key":"o0CHtir5si"},{"type":"math","value":"\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - f(x)}{\\delta}","position":{"start":{"line":939,"column":1},"end":{"line":941,"column":1}},"html":"ddxf(x)=limδ0f(x+δ)f(x)δ\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - f(x)}{\\delta}dxdf(x)=δ0limδf(x+δ)f(x)","enumerator":"2.46","key":"AxWZAgbQTK"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":946,"column":1}},"children":[{"type":"text","value":"Note that this only requires us to be able to ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"Ibvqo7fUUq"},{"type":"emphasis","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"query","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"iu1dgbP4Bk"}],"key":"Vi8CPlHZf3"},{"type":"text","value":" the function, not\nto have an analytical expression for it, which is why it’s so useful in\npractice.","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"vSxrYmO8ro"}],"key":"UVjheC0Wo4"},
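
As a minimal sketch of finite differencing in Python (the helper names, the central-difference variant, and the step size `delta` are our own illustrative choices, not part of the text):

```python
import numpy as np

def finite_diff_grad(func, x, delta=1e-5):
    """Estimate the gradient of a scalar-valued function at x.

    Uses a central difference (f(x + d) - f(x - d)) / (2 d) per coordinate,
    a slightly more accurate variant of the one-sided limit definition above.
    Only query access to `func` is needed -- no analytical expression.
    """
    grad = np.zeros_like(x, dtype=float)
    for i in range(x.size):
        step = np.zeros_like(x, dtype=float)
        step[i] = delta
        grad[i] = (func(x + step) - func(x - step)) / (2 * delta)
    return grad

def finite_diff_hessian(func, x, delta=1e-4):
    """Estimate the Hessian by finite-differencing the gradient itself."""
    hess = np.zeros((x.size, x.size))
    for j in range(x.size):
        step = np.zeros_like(x, dtype=float)
        step[j] = delta
        hess[:, j] = (finite_diff_grad(func, x + step, delta)
                      - finite_diff_grad(func, x - step, delta)) / (2 * delta)
    return hess

# Sanity check on a known function: the gradient of x^T x is 2x.
print(finite_diff_grad(lambda v: v @ v, np.array([1.0, 2.0])))  # approx. [2. 4.]
```
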
{"type":"heading","depth":3,"position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Local convexification","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"xI2gUIGnYK"}],"identifier":"local-convexification","label":"Local convexification","html_id":"local-convexification","implicit":true,"enumerator":"2.6.3","key":"NI0BCL8Ihs"},{"type":"paragraph","position":{"start":{"line":950,"column":1},"end":{"line":953,"column":1}},"children":[{"type":"text","value":"However, simply taking the second-order approximation of the cost\nfunction is insufficient, since for the LQR setup we required that the\n","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"uAMNbryGrq"},{"type":"inlineMath","value":"Q","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"QQQ","key":"D2rDbhl5Z6"},{"type":"text","value":" and ","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"EQpzIZ7H43"},{"type":"inlineMath","value":"R","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"RRR","key":"JVxzJzI0H6"},{"type":"text","value":" matrices were positive definite, i.e. 
that all of their\neigenvalues were positive.","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"Ir2RaQYHSj"}],"key":"Yysl1LRWZU"},{"type":"paragraph","position":{"start":{"line":955,"column":1},"end":{"line":960,"column":1}},"children":[{"type":"text","value":"One way to naively ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"Ji5Nhb4UXP"},{"type":"emphasis","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"children":[{"type":"text","value":"force","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"pYAIERGHcI"}],"key":"TBJaBgC0fT"},{"type":"text","value":" some symmetric matrix ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"nk8J4MVcOA"},{"type":"inlineMath","value":"D","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"DDD","key":"Xu7r4NpdVK"},{"type":"text","value":" to be positive definite\nis to set any non-positive eigenvalues to some small positive value ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"L2Q565FHHX"},{"type":"inlineMath","value":"\\varepsilon > 0","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"ε>0\\varepsilon > 0ε>0","key":"g2b8zgcYdA"},{"type":"text","value":".\nRecall that any real symmetric matrix ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"t6KZA5QF7M"},{"type":"inlineMath","value":"D \\in \\mathbb{R}^{n \\times n}","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"DRn×nD \\in \\mathbb{R}^{n \\times n}DRn×n","key":"z84mos9Uef"},{"type":"text","value":" has a basis of eigenvectors ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"PmPfDplNIh"},{"type":"inlineMath","value":"u_1, \\dots, u_n","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"u1,,unu_1, \\dots, u_nu1,,un","key":"m8bHZaJTZo"},{"type":"text","value":"\nwith corresponding eigenvalues ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"dvoiZdxapG"},{"type":"inlineMath","value":"\\lambda_1, \\dots, \\lambda_n","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"λ1,,λn\\lambda_1, \\dots, \\lambda_nλ1,,λn","key":"rZrUhSs3Ou"},{"type":"text","value":"\nsuch that ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"c168aPctkQ"},{"type":"inlineMath","value":"D u_i = \\lambda_i u_i","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"Dui=λiuiD u_i = \\lambda_i u_iDui=λiui","key":"G2i7Lv6Sbu"},{"type":"text","value":".\nThen we can construct the positive definite approximation by","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"wY3Xqpr53m"}],"key":"JlRRoIyzet"},{"type":"math","value":"\\widetilde{D} = \\left( \\sum_{i=1, \\dots, n \\mid \\lambda_i > 0} \\lambda_i u_i u_i^\\top \\right) + \\varepsilon I.","position":{"start":{"line":962,"column":1},"end":{"line":964,"column":1}},"html":"D~=(i=1,,nλi>0λiuiui)+εI.\\widetilde{D} = \\left( \\sum_{i=1, \\dots, n \\mid \\lambda_i > 0} \\lambda_i u_i u_i^\\top \\right) + \\varepsilon I.D=i=1,,nλi>0λiuiui+εI.","enumerator":"2.47","key":"vB6xmTAxxN"},
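
In code, this construction might look as follows (a sketch using `numpy`; the function name and the default `eps` are our own choices). It relies on the fact that `np.linalg.eigh` returns the eigendecomposition of a symmetric matrix:

```python
import numpy as np

def positive_definite_approx(D, eps=1e-6):
    """Construct the positive definite approximation of a symmetric matrix D:
    keep only the positive-eigenvalue directions, then add eps * I."""
    lambdas, U = np.linalg.eigh(D)  # eigenvalues and orthonormal eigenvectors
    lambdas = np.where(lambdas > 0, lambdas, 0.0)  # drop non-positive directions
    return U @ np.diag(lambdas) @ U.T + eps * np.eye(D.shape[0])

# e.g. diag(1, -2) becomes diag(1 + eps, eps), which is positive definite.
print(positive_definite_approx(np.diag([1.0, -2.0])))
```
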
{"type":"paragraph","position":{"start":{"line":968,"column":1},"end":{"line":969,"column":1}},"children":[{"type":"strong","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"key":"LSe2AtsMp5"}],"key":"WZ8nWbkdGk"},{"type":"text","value":" Convince yourself that ","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"key":"xDkPv3Ysps"},{"type":"inlineMath","value":"\\widetilde{D}","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"html":"D~\\widetilde{D}D","key":"uNyEEkjRnl"},{"type":"text","value":" is indeed positive\ndefinite.","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"key":"vr3GwrjpWu"}],"key":"VojDWIGT5o"},{"type":"paragraph","position":{"start":{"line":971,"column":1},"end":{"line":977,"column":1}},"children":[{"type":"text","value":"Note that Hessian matrices are generally symmetric, so we can apply this\nprocess to ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"N8yff9vy84"},{"type":"inlineMath","value":"Q","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"QQQ","key":"vIjwHv3Cge"},{"type":"text","value":" and ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"NdIFgGX4XC"},{"type":"inlineMath","value":"R","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"RRR","key":"CV5KDd1cu5"},{"type":"text","value":" to obtain the positive definite approximations\n","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"inucTgwbeY"},{"type":"inlineMath","value":"\\widetilde{Q}","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"Q~\\widetilde{Q}Q","key":"PP5shwmwGI"},{"type":"text","value":" and ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"mimaflcQMN"},{"type":"inlineMath","value":"\\widetilde{R}","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"R~\\widetilde{R}R","key":"jFCVPG4S4C"},{"type":"text","value":".\nNow that we have an upward-curved\nquadratic approximation to the cost function, and a linear approximation\nto the state transitions, we can simply apply the time-homogeneous LQR\nmethods from ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"u2UpDwIXSq"},{"type":"crossReference","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"children":[{"type":"text","value":"Section ","key":"aOGDbe1UUb"},{"type":"text","value":"2.4","key":"J7pOozfea1"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"O0FGl1AWNO"},{"type":"text","value":".","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"vUhrgTrek2"}],"key":"Ug7Fwttv2I"},{"type":"paragraph","position":{"start":{"line":979,"column":1},"end":{"line":983,"column":1}},"children":[{"type":"text","value":"But what happens when we enter states far away from 
","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"CQwrt6OnnQ"},{"type":"inlineMath","value":"\\st^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"x\\st^\\starx","key":"ZkLoYZvqw5"},{"type":"text","value":" or want\nto use actions far from ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"D4MpwZWlBN"},{"type":"inlineMath","value":"\\act^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"u\\act^\\staru","key":"VD8rxv4jrp"},{"type":"text","value":"? A Taylor approximation is only\naccurate in a ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"GKk0hBHCf8"},{"type":"emphasis","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"local","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"baQcEOG08m"}],"key":"P4R0GsTFb6"},{"type":"text","value":" region around the point of linearization, so the\nperformance of our LQR controller will degrade as we move further away.\nWe’ll see how to address this in the next section using the ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"i0B0hkUFyi"},{"type":"strong","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"QSB2QLjhqf"}],"key":"M42nOId6Dp"},{"type":"text","value":" algorithm.","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"QLU1DoImjs"}],"key":"gKsWmQATDW"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.png","alt":"Local linearization might only be accurate in a small region around the\npoint of linearization.","data":{"altTextIsAutoGenerated":true},"key":"Los6zV6OI2","urlSource":"shared/log_taylor.png","urlOptimized":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":989,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"local_linearization","identifier":"local_linearization","html_id":"local-linearization","enumerator":"2.3","children":[{"type":"text","value":"Figure ","key":"YrrmdNVGM0"},{"type":"text","value":"2.3","key":"qX2XN0JUXa"},{"type":"text","value":":","key":"kzYHEWUxVj"}],"template":"Figure %s:","key":"nSaeysUfl5"},{"type":"text","value":"Local linearization might only be accurate in a small region around the\npoint of linearization.","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"qgWlg41U80"}],"key":"AiSjG1hFn6"}],"key":"ChjQDPyLtr"}],"label":"local_linearization","identifier":"local_linearization","enumerator":"2.3","html_id":"local-linearization","key":"iqBAtv5dml"},{"type":"heading","depth":3,"position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"key":"VOJ98fEXJ4"}],"label":"iterative_lqr","identifier":"iterative_lqr","html_id":"iterative-lqr","enumerator":"2.6.4","key":"etV1gH2rM7"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"To address these issues with local 
linearization, we’ll use an iterative\napproach, where we repeatedly linearize around different points to\ncreate a ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"cZSPy4NiYs"},{"type":"emphasis","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"time-dependent","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"nlWEaQ2vmv"}],"key":"sBymMncmNU"},{"type":"text","value":" approximation of the dynamics, and then solve\nthe resulting time-dependent LQR problem to obtain a better policy. This\nis known as ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Fx8iPPIbQz"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Tnpehjpb2t"}],"key":"Eri962TgVf"},{"type":"text","value":" or ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"JO1KrvUknE"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iLQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Kc3RWwVG6f"}],"key":"vgqcbs6tm9"},{"type":"text","value":":","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Urnowon83I"}],"key":"j4QgEXq5TW"},{"type":"proof","kind":"definition","label":"ilqr","identifier":"ilqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"O6UkaBpGp0"}],"key":"vpNnR64jkN"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"For each iteration of the algorithm:","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"QyOzp6guxc"}],"key":"xwEApzY4Gj"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":1006,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":1006,"column":1},"end":{"line":1007,"column":1}},"children":[{"type":"text","value":"Form a time-dependent LQR problem around the current candidate\ntrajectory using local linearization.","position":{"start":{"line":1006,"column":1},"end":{"line":1006,"column":1}},"key":"FQiZdT0MV7"}],"key":"CuRdLp7a3q"},{"type":"listItem","spread":true,"position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Compute the optimal policy using ","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"ze6bIgg7Hs"},{"type":"crossReference","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Section ","key":"STSgiiUG1s"},{"type":"text","value":"2.5.1","key":"i1WTPl2oS5"}],"identifier":"time_dep_lqr","label":"time_dep_lqr","kind":"heading","template":"Section 
%s","enumerator":"2.5.1","resolved":true,"html_id":"time-dep-lqr","key":"or1G6RGBgb"},{"type":"text","value":".","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"H77lUDj34O"}],"key":"GCXG1NeSCA"},{"type":"listItem","spread":true,"position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"children":[{"type":"text","value":"Generate a new series of actions using this policy.","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"rjWPpUWR1P"}],"key":"WHRqhz8IWV"},{"type":"listItem","spread":true,"position":{"start":{"line":1010,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"Compute a better candidate trajectory by interpolating between the\ncurrent and proposed actions.","position":{"start":{"line":1010,"column":1},"end":{"line":1010,"column":1}},"key":"SxyXIVTL2M"}],"key":"jVauTbypAU"}],"key":"ylLsX3G1s9"}],"enumerator":"2.9","html_id":"ilqr","key":"ezfHymlohw"},{"type":"paragraph","position":{"start":{"line":1014,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Now let’s go through the details of each step. We’ll use superscripts to\ndenote the iteration of the algorithm. We’ll also denote\n","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"DWkVlxkmBY"},{"type":"inlineMath","value":"\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"html":"xˉ0=Ex0μ0[x0]\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]xˉ0=Ex0μ0[x0]","key":"EguELmqWFk"},{"type":"text","value":" as the expected initial\nstate.","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"VDr5xjVCJ3"}],"key":"kiOuXB5nHE"},{"type":"paragraph","position":{"start":{"line":1019,"column":1},"end":{"line":1021,"column":1}},"children":[{"type":"text","value":"At iteration ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"Q8Jaak58OE"},{"type":"inlineMath","value":"i","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"iii","key":"tZK7LihEkL"},{"type":"text","value":" of the algorithm, we begin with a ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"uStHDe4qR1"},{"type":"strong","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"children":[{"type":"text","value":"candidate","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"pW6JRWdGRM"}],"key":"YyjEMWv45t"},{"type":"text","value":"\ntrajectory\n","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"bXbRSnIZdD"},{"type":"inlineMath","value":"\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)","key":"I1zhuHTclJ"},{"type":"text","value":".","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"hS2bAcDYia"}],"key":"npv2A8pBbh"},{"type":"paragraph","position":{"start":{"line":1023,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"Step 1: Form a time-dependent 
LQR problem.","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"QEGfwEyVA7"}],"key":"numpzIWCmn"},{"type":"text","value":" At each timestep\n","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"SRraPLn6We"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"kT3CfFb71n"},{"type":"text","value":", we use the techniques from\n","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"T9aE7xBIdD"},{"type":"crossReference","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"Section ","key":"eREQq5FIvF"},{"type":"text","value":"2.6","key":"N2dATEx9qw"}],"identifier":"approx_nonlinear","label":"approx_nonlinear","kind":"heading","template":"Section %s","enumerator":"2.6","resolved":true,"html_id":"approx-nonlinear","key":"RoBjIT3uGx"},{"type":"text","value":" to linearize the dynamics and\nquadratize the cost function around ","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"nwyZyTPUzt"},{"type":"inlineMath","value":"(\\bar \\st^i_\\hi, \\bar \\act^i_\\hi)","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"html":"(xˉhi,uˉhi)(\\bar \\st^i_\\hi, \\bar \\act^i_\\hi)(xˉhi,uˉhi)","key":"v4Puumsenn"},{"type":"text","value":":","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"XIQceHG1UP"}],"key":"Wv9ViZwSYl"},{"type":"math","value":"\\begin{aligned}\n f_\\hi(\\st, \\act) & \\approx f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\nabla_{\\st } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\st - \\bar {\\st}^i_\\hi) + \\nabla_{\\act } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\act - \\bar {\\act}^i_\\hi) \\\\\n c_\\hi(\\st, \\act) & \\approx c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st } c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\\\\\n \\nabla_{\\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix} \\\\\n & \\qquad + \\frac{1}{2} \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\st \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) \\\\\n \\nabla_{\\act \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\act \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix}\n \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi\\\\\n \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix}.\n\\end{aligned}","position":{"start":{"line":1029,"column":1},"end":{"line":1049,"column":1}},"html":"fh(x,u)f(xˉhi,uˉhi)+xf(xˉhi,uˉhi)(xxˉhi)+uf(xˉhi,uˉhi)(uuˉhi)ch(x,u)c(xˉhi,uˉhi)+[xxˉhiuuˉhi][xc(xˉhi,uˉhi)uc(xˉhi,uˉhi)]+12[xxˉhiuuˉhi][xxc(xˉhi,uˉhi)xuc(xˉhi,uˉhi)uxc(xˉhi,uˉhi)uuc(xˉhi,uˉhi)][xxˉhiuuˉhi].\\begin{aligned}\n f_\\hi(\\st, \\act) & \\approx f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\nabla_{\\st } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\st - \\bar {\\st}^i_\\hi) + \\nabla_{\\act } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\act - \\bar {\\act}^i_\\hi) \\\\\n c_\\hi(\\st, \\act) & \\approx c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} 
\\begin{bmatrix}\n \\nabla_{\\st } c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\\\\\n \\nabla_{\\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix} \\\\\n & \\qquad + \\frac{1}{2} \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\st \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) \\\\\n \\nabla_{\\act \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\act \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix}\n \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi\\\\\n \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix}.\n\\end{aligned}fh(x,u)ch(x,u)f(xˉhi,uˉhi)+xf(xˉhi,uˉhi)(xxˉhi)+uf(xˉhi,uˉhi)(uuˉhi)c(xˉhi,uˉhi)+[xxˉhiuuˉhi][xc(xˉhi,uˉhi)uc(xˉhi,uˉhi)]+21[xxˉhiuuˉhi][xxc(xˉhi,uˉhi)uxc(xˉhi,uˉhi)xuc(xˉhi,uˉhi)uuc(xˉhi,uˉhi)][xxˉhiuuˉhi].","enumerator":"2.48","key":"NRDps4YXVP"},{"type":"paragraph","position":{"start":{"line":1053,"column":1},"end":{"line":1056,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"children":[{"type":"text","value":"Step 2: Compute the optimal policy.","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"lPBSA2vdPT"}],"key":"fCY3Ve5HUg"},{"type":"text","value":" We can now solve the\ntime-dependent LQR problem using the Riccati equation from\n","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"v8lWUVv1mb"},{"type":"crossReference","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"children":[{"type":"text","value":"Section ","key":"ZluGkftnyB"},{"type":"text","value":"2.5.1","key":"RyUuNiPXby"}],"identifier":"time_dep_lqr","label":"time_dep_lqr","kind":"heading","template":"Section %s","enumerator":"2.5.1","resolved":true,"html_id":"time-dep-lqr","key":"a0hZxgQOPf"},{"type":"text","value":" to compute the optimal policy\n","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"BH3QqqmA1n"},{"type":"inlineMath","value":"\\pi^i_0, \\dots, \\pi^i_{\\hor-1}","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"html":"π0i,,πH1i\\pi^i_0, \\dots, \\pi^i_{\\hor-1}π0i,,πH1i","key":"MPIGGkqxwk"},{"type":"text","value":".","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"uBRm9vasji"}],"key":"ZkMU3Rqwbi"},{"type":"paragraph","position":{"start":{"line":1058,"column":1},"end":{"line":1059,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"children":[{"type":"text","value":"Step 3: Generate a new series of actions.","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"key":"VTaT0xYMN4"}],"key":"CyGqhuO3Kc"},{"type":"text","value":" We can then generate a new\nsample trajectory by taking actions according to this optimal policy:","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"key":"KYicO5Fkzp"}],"key":"mbJmq8zpGY"},{"type":"math","value":"\\bar \\st^{i+1}_0 = \\bar \\st_0, \\qquad \\widetilde \\act_\\hi = \\pi^i_\\hi(\\bar \\st^{i+1}_\\hi), \\qquad \\bar \\st^{i+1}_{\\hi+1} = f(\\bar \\st^{i+1}_\\hi, \\widetilde \\act_\\hi).","position":{"start":{"line":1062,"column":1},"end":{"line":1064,"column":1}},"html":"xˉ0i+1=xˉ0,u~h=πhi(xˉhi+1),xˉh+1i+1=f(xˉhi+1,u~h).\\bar \\st^{i+1}_0 = \\bar \\st_0, \\qquad \\widetilde \\act_\\hi = 
\\pi^i_\\hi(\\bar \\st^{i+1}_\\hi), \\qquad \\bar \\st^{i+1}_{\\hi+1} = f(\\bar \\st^{i+1}_\\hi, \\widetilde \\act_\\hi).xˉ0i+1=xˉ0,uh=πhi(xˉhi+1),xˉh+1i+1=f(xˉhi+1,uh).","enumerator":"2.49","key":"BDTpfhBlcd"},{"type":"paragraph","position":{"start":{"line":1067,"column":1},"end":{"line":1068,"column":1}},"children":[{"type":"text","value":"Note that the states are sampled according to the ","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"OdRW6aMEjJ"},{"type":"emphasis","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"children":[{"type":"text","value":"true","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"Y1kjxe9fp8"}],"key":"X6uRcwq7Jt"},{"type":"text","value":" dynamics, which\nwe assume we have query access to.","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"QilCSsh8Nl"}],"key":"dYEMqn1H3i"},{"type":"paragraph","position":{"start":{"line":1070,"column":1},"end":{"line":1077,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"Step 4: Compute a better candidate trajectory.","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"sBdmixm3Lc"}],"key":"V3tNydEffr"},{"type":"text","value":" Note that we’ve\ndenoted these actions as ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"Bhp34Oo2iM"},{"type":"inlineMath","value":"\\widetilde \\act_\\hi","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"u~h\\widetilde \\act_\\hiuh","key":"zkqzuHSGib"},{"type":"text","value":" and aren’t directly using\nthem for the next iteration ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"VagDw4glTW"},{"type":"inlineMath","value":"\\bar \\act^{i+1}_\\hi","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"uˉhi+1\\bar \\act^{i+1}_\\hiuˉhi+1","key":"OVvJInVTpb"},{"type":"text","value":". Rather, we want to\n","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"MshGBwUfKp"},{"type":"emphasis","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"interpolate","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"sU5MEr16B7"}],"key":"W4M5VK3tmb"},{"type":"text","value":" between them and the actions from the previous iteration\n","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"SEQWwrubEe"},{"type":"inlineMath","value":"\\bar \\act^i_0, \\dots, \\bar \\act^i_{\\hor-1}","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"uˉ0i,,uˉH1i\\bar \\act^i_0, \\dots, \\bar \\act^i_{\\hor-1}uˉ0i,,uˉH1i","key":"OLkstWHo0k"},{"type":"text","value":". This is so that the cost\nwill ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"ZQZ80Jl6Qz"},{"type":"emphasis","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"decrease monotonically,","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"pf2BFDdVwa"}],"key":"QX0d8FBNpH"},{"type":"text","value":" since if the new policy turns out to\nactually be worse, we can stay closer to the previous trajectory. (Can\nyou think of an intuitive example where this might happen?)","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"auC3GimRnP"}],"key":"fDQgCmhfap"},
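
A minimal sketch of this interpolation step in Python (assuming query access to the true dynamics `f` and cost `c`; the simple grid search over α is our own illustrative stand-in for solving the minimization formalized below exactly):

```python
import numpy as np

def rollout_cost(f, c, x0, actions):
    """Total cost of following a fixed action sequence under the true dynamics."""
    x, total = x0, 0.0
    for u in actions:
        total += c(x, u)
        x = f(x, u)
    return total

def interpolate_actions(f, c, x0, prev_actions, new_actions, num_alphas=11):
    """Search over alpha in [0, 1] for the interpolation
    alpha * prev + (1 - alpha) * new that gives the lowest rollout cost."""
    best_alpha, best_cost = 0.0, np.inf
    for alpha in np.linspace(0.0, 1.0, num_alphas):
        actions = [alpha * u_prev + (1 - alpha) * u_new
                   for u_prev, u_new in zip(prev_actions, new_actions)]
        cost = rollout_cost(f, c, x0, actions)
        if cost < best_cost:
            best_alpha, best_cost = alpha, cost
    return best_alpha, best_cost
```
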
{"type":"paragraph","position":{"start":{"line":1079,"column":1},"end":{"line":1082,"column":1}},"children":[{"type":"text","value":"Formally, we want to find ","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"CYZE90zudM"},{"type":"inlineMath","value":"\\alpha \\in [0, 1]","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"α[0,1]\\alpha \\in [0, 1]α[0,1]","key":"IDzg6wIXhB"},{"type":"text","value":" to generate the next\niteration of actions\n","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"hPhb7FEyFS"},{"type":"inlineMath","value":"\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"uˉ0i+1,,uˉH1i+1\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}uˉ0i+1,,uˉH1i+1","key":"c6HCJuVOxK"},{"type":"text","value":" such that the cost\nis minimized:","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"gEbkXFpTYG"}],"key":"tmm9LtjdCS"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n & \\st_0 = \\bar \\st_0.\n\\end{aligned}","position":{"start":{"line":1084,"column":1},"end":{"line":1091,"column":1}},"html":"minα[0,1]h=0H1c(xh,uˉhi+1)wherexh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)u~hx0=xˉ0.\\begin{aligned}\n \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n & \\st_0 = \\bar \\st_0.\n\\end{aligned}α[0,1]minwhereh=0H1c(xh,uˉhi+1)xh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)uhx0=xˉ0.","enumerator":"2.50","key":"BeNXKZO5eN"},{"type":"paragraph","position":{"start":{"line":1093,"column":1},"end":{"line":1095,"column":1}},"children":[{"type":"text","value":"Note that this optimizes over the closed interval\n","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"ncGfPAoXhU"},{"type":"inlineMath","value":"[0, 1]","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"html":"[0,1][0, 1][0,1]","key":"xOo13D918e"},{"type":"text","value":", so by the Extreme Value Theorem, it’s guaranteed to have a\nglobal minimum.","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"AmIiwkOLcF"}],"key":"Ow5jkVOs9f"},{"type":"paragraph","position":{"start":{"line":1097,"column":1},"end":{"line":1101,"column":1}},"children":[{"type":"text","value":"The final output of this algorithm is a policy ","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"OYoRWtxTrT"},{"type":"inlineMath","value":"\\pi^{n_\\text{steps}}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"πnsteps\\pi^{n_\\text{steps}}πnsteps","key":"uXg2sHuviC"},{"type":"text","value":"\nderived after 
","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"Vz7I4iWUwz"},{"type":"inlineMath","value":"n_\\text{steps}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"nstepsn_\\text{steps}nsteps","key":"z7kiHhzZM9"},{"type":"text","value":" of the algorithm. Though the proof is\nsomewhat complex, one can show that for many nonlinear control problems,\nthis solution converges to a locally optimal solution (in the policy\nspace).","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"VNN53dzx2k"}],"key":"E9ygrHZ2H8"},{"type":"heading","depth":2,"position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"key":"esy1LyedNK"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"2.7","key":"HZ3LpQ2bDe"},{"type":"paragraph","position":{"start":{"line":1105,"column":1},"end":{"line":1112,"column":1}},"children":[{"type":"text","value":"This chapter introduced some approaches to solving different variants of\nthe optimal control problem\n","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"OZtEnCJyjh"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"D5HFcEo3R8"},{"type":"text","value":"2.1","key":"GRAyKN8avm"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"yGPnMzzZZJ"},{"type":"text","value":". We began with the simple case of linear\ndynamics and an upward-curved quadratic cost. This model is called the\nLQR and we solved for the optimal policy using dynamic programming. We\nthen extended these results to the more general nonlinear case via local\nlinearization. 
We finally saw the iterative LQR algorithm for solving\nnonlinear control problems.","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"Fha7hSVGeN"}],"key":"ntfkzLTUpY"}],"key":"grFmYqNCjU"}],"key":"tz7iNf2ntI"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"6b86f35044831ffbe0cf07af5eee27ce5d28fea0397ecdc730ddaa67506611c3","slug":"control","location":"/control.md","dependencies":[],"frontmatter":{"title":"2 Linear Quadratic Regulators","numbering":{"all":{"enabled":true},"enumerator":{"template":"2.%s"}},"math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"x"},"\\act":{"macro":"u"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","thumbnail":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.jpg","thumbnailOptimized":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.webp","exports":[{"format":"md","filename":"control.md","url":"/build/control-a8c1e7d39cf806d9a073317a2544cfca.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":21,"column":1},"end":{"line":21,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":21,"column":1},"end":{"line":21,"column":1}},"key":"Ozq4ciGdi1"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"2.1","key":"qLvZ9sth7f"},{"type":"paragraph","position":{"start":{"line":23,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"Up to this point, we have considered decision problems with finitely\nmany states and actions. However, in many applications, states and\nactions may take on continuous values. For example, consider autonomous\ndriving, controlling a robot’s joints, and automated manufacturing. How\ncan we teach computers to solve these kinds of problems? 
This is the\ntask of ","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"H9l2HeRrNA"},{"type":"strong","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"continuous control","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"JlA1YZ7SVw"}],"key":"uFXMWhvVU8"},{"type":"text","value":".","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"tCErFUZzxV"}],"key":"xHodts1xt1"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.jpg","alt":"Solving a Rubik’s Cube with a robot hand.","data":{"altTextIsAutoGenerated":true},"key":"pF8XNVzcWA","urlSource":"shared/rubiks_cube.jpg","urlOptimized":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"control_examples","identifier":"control_examples","html_id":"control-examples","enumerator":"2.1","children":[{"type":"text","value":"Figure ","key":"n83KFPeBim"},{"type":"text","value":"2.1","key":"CdHDsj4dKo"},{"type":"text","value":":","key":"hXELk7kO9W"}],"template":"Figure %s:","key":"HD6armt0Vh"},{"type":"text","value":"Solving a Rubik’s Cube with a robot hand.","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"OcZVucjRs6"}],"key":"WiHIVxlFfB"}],"key":"Y0TXg1iTAH"}],"label":"control_examples","identifier":"control_examples","enumerator":"2.1","html_id":"control-examples","key":"qoKJgR8luc"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/boston_dynamics-07bc07f0646e10c0fddbe75b26862eee.jpg","alt":"Boston Dynamics’s Spot robot.","data":{"altTextIsAutoGenerated":true},"key":"wimyN3U2Ta","urlSource":"shared/boston_dynamics.jpg","urlOptimized":"/build/boston_dynamics-07bc07f0646e10c0fddbe75b26862eee.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"robot_hand","identifier":"robot_hand","html_id":"robot-hand","enumerator":"2.2","children":[{"type":"text","value":"Figure ","key":"Wg2pklxrT8"},{"type":"text","value":"2.2","key":"csJEOWNdbp"},{"type":"text","value":":","key":"RBWDojPskN"}],"template":"Figure %s:","key":"SNth9r3RdV"},{"type":"text","value":"Boston Dynamics’s Spot robot.","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"key":"biSzHMKYfb"}],"key":"dvuSGOecTM"}],"key":"UC5GcxswI3"}],"label":"robot_hand","identifier":"robot_hand","enumerator":"2.2","html_id":"robot-hand","key":"FNsLonCLh3"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":46,"column":1}},"children":[{"type":"text","value":"Aside from the change in the state and action spaces, the general\nproblem setup remains the same: we seek to construct an ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"zncLho35nY"},{"type":"emphasis","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"lYC80xWn3E"}],"key":"Cm5dMLMMn5"},{"type":"text","value":"\nthat outputs actions to solve the desired task. 
We will see that many\nkey ideas and algorithms, in particular dynamic programming algorithms,\ncarry over to this new setting.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"RbAXMaK2g1"}],"key":"lL6TdiyWAh"},{"type":"paragraph","position":{"start":{"line":48,"column":1},"end":{"line":50,"column":1}},"children":[{"type":"text","value":"This chapter introduces a fundamental tool to solve a simple class of\ncontinuous control problems: the ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"lgUSjHoYdZ"},{"type":"strong","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"AphPa29FpB"}],"key":"nUTtFRbyiw"},{"type":"text","value":". We will\nthen extend this basic method to more complex settings.","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"W1tQe1Dk3C"}],"key":"JgQKA1VHbx"},{"type":"proof","kind":"example","label":"cart_pole","identifier":"cart_pole","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"CartPole","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"tsTNQ5I1xO"}],"key":"e2xfbbT423"},{"type":"paragraph","position":{"start":{"line":55,"column":1},"end":{"line":59,"column":1}},"children":[{"type":"text","value":"Try to balance a pencil on its point on a flat surface. It’s much more\ndifficult than it may first seem: the position of the pencil varies\ncontinuously, and the state transitions governing the system, i.e. the\nlaws of physics, are highly complex. This task is equivalent to the\nclassic control problem known as ","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"key":"Lh4G1hGku3"},{"type":"emphasis","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"children":[{"type":"text","value":"CartPole","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"key":"KuyYPK9E6c"}],"key":"eEiWwZKMir"},{"type":"text","value":":","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"key":"L4x3XaDexH"}],"key":"ISq6T9L97S"},{"type":"image","url":"/build/cart_pole-cbbb59437cd1cf4230050ca053220243.png","width":"200px","align":"center","key":"PR26sS10jT","urlSource":"shared/cart_pole.png","urlOptimized":"/build/cart_pole-cbbb59437cd1cf4230050ca053220243.webp"},{"type":"paragraph","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"The state ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"RGT3Eq1Uv0"},{"type":"inlineMath","value":"\\st \\in \\mathbb{R}^4","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"html":"xR4\\st \\in \\mathbb{R}^4xR4","key":"vKNctjj4m7"},{"type":"text","value":" can be described by:","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"goQKI08T8r"}],"key":"Kye7Ag47aO"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":67,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":67,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"the position of the 
cart;","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"DVfaSc0nmp"}],"key":"a1NtkO8I3k"}],"key":"ltZCWZsYlI"},{"type":"listItem","spread":true,"position":{"start":{"line":69,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"children":[{"type":"text","value":"the velocity of the cart;","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"Km7yAsEvjB"}],"key":"h5iwLUDngd"}],"key":"oa69zp4XUZ"},{"type":"listItem","spread":true,"position":{"start":{"line":71,"column":1},"end":{"line":72,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"the angle of the pole;","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"b830U00qBZ"}],"key":"rhHHBhCfun"}],"key":"JkunOcXF8v"},{"type":"listItem","spread":true,"position":{"start":{"line":73,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"children":[{"type":"text","value":"the angular velocity of the pole.","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"mh48tDz0a9"}],"key":"lQS0GbDRvF"}],"key":"w63z5rYBYZ"}],"key":"jwiQ5R7foW"},{"type":"paragraph","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"children":[{"type":"text","value":"We can ","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"Vjp3U1Mbdm"},{"type":"emphasis","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"children":[{"type":"text","value":"control","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"pd9c7MT1zM"}],"key":"EkGrAwfulW"},{"type":"text","value":" the cart by applying a horizontal force ","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"HltSNiv2R2"},{"type":"inlineMath","value":"\\act \\in \\mathbb{R}","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"html":"uR\\act \\in \\mathbb{R}uR","key":"vi18PgUehN"},{"type":"text","value":".","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"qd0fCL7pQ3"}],"key":"stTOjNz2jk"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"strong","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"Goal:","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"bssWvVa3j7"}],"key":"nvMmakl0St"},{"type":"text","value":" Stabilize the cart around an ideal state and action\n","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"QQF0RBKr4x"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"xFya7Ldzw3"},{"type":"text","value":".","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"CkuQarCihN"}],"key":"mWErd8PTDh"}],"enumerator":"2.1","html_id":"cart-pole","key":"LA187fAXwr"},{"type":"heading","depth":2,"position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Optimal 
control","position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"key":"ouS4NmfdMB"}],"identifier":"optimal-control","label":"Optimal control","html_id":"optimal-control","implicit":true,"enumerator":"2.2","key":"jyh7Ab6Ret"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"Recall that an MDP is defined by its state space ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"fqngj6J2dd"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"S\\mathcal{S}S","key":"lrxghbqROf"},{"type":"text","value":", action space\n","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"tY4DwpazYU"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"A\\mathcal{A}A","key":"shGmvyUF7A"},{"type":"text","value":", state transitions ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"bNtOBIOixO"},{"type":"inlineMath","value":"P","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"PPP","key":"Xxbs5tZac7"},{"type":"text","value":", reward function ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"DXBxu809ig"},{"type":"inlineMath","value":"r","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"rrr","key":"gmUmtKuFN9"},{"type":"text","value":", and discount factor\n","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"jYyFUgQhnX"},{"type":"text","value":"γ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"QXbrQ7imEm"},{"type":"text","value":" or time horizon ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"ZTtkcdIdqw"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"H\\horH","key":"ZDkse06FG4"},{"type":"text","value":". 
These have equivalents in the control\nsetting:","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"WbH1sMUPsz"}],"key":"jpxuC9X8T7"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":88,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":88,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The state and action spaces are ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"HBA5Cvdn7b"},{"type":"emphasis","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"VKQywXFXrj"}],"key":"Nkci8aEFmc"},{"type":"text","value":" rather than finite.\nThat is, ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"dP1SxbZlD8"},{"type":"inlineMath","value":"\\mathcal{S} \\subseteq \\mathbb{R}^{n_\\st}","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"SRnx\\mathcal{S} \\subseteq \\mathbb{R}^{n_\\st}SRnx","key":"nwFb0XSRNF"},{"type":"text","value":" and ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"NWUyerhnVB"},{"type":"inlineMath","value":"\\mathcal{A} \\subseteq \\mathbb{R}^{n_\\act}","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"ARnu\\mathcal{A} \\subseteq \\mathbb{R}^{n_\\act}ARnu","key":"PX6KgbcwAl"},{"type":"text","value":",\nwhere ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"sQzQjKUNQU"},{"type":"inlineMath","value":"n_\\st","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"nxn_\\stnx","key":"NH3ejeNZVu"},{"type":"text","value":" and ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"ziTqQTw71x"},{"type":"inlineMath","value":"n_\\act","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"nun_\\actnu","key":"R9qzBXWLdK"},{"type":"text","value":" are the corresponding dimensions of these\nspaces, i.e. the number of coordinates to specify a single state or\naction respectively.","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"rDvAr6D8f8"}],"key":"MvOWcNTs0i"}],"key":"MV0kYqSWKY"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"We call the state transitions the ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Akjyyogay4"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"dynamics","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Ti3ZYt9k8O"}],"key":"yGbPNpGKiM"},{"type":"text","value":" of the system. 
In the\nmost general case, these might change across timesteps and also\ninclude some stochastic ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"yKC5h0uwJj"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"noise","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"PmW2iLMN9M"}],"key":"gUTUgmxD4c"},{"type":"text","value":" ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"vRUvrb7K1O"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"whw_\\hiwh","key":"mhp3GQ9H3v"},{"type":"text","value":" at each timestep. We\ndenote these dynamics as the function ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"EieETguqMl"},{"type":"inlineMath","value":"f_\\hi","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"fhf_\\hifh","key":"ebKQwmcfQe"},{"type":"text","value":" such that\n","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"SGtW4o75up"},{"type":"inlineMath","value":"\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi)","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"xh+1=fh(xh,uh,wh)\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi)xh+1=fh(xh,uh,wh)","key":"NRpbimho8p"},{"type":"text","value":". Of course, we can\nsimplify to cases where the dynamics are ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"nEicBr3fYC"},{"type":"emphasis","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"deterministic/noise-free","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"mDKE4La4O3"}],"key":"eQcyWhXRKM"},{"type":"text","value":"\n(no ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"bveGOvN8vF"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"whw_\\hiwh","key":"kgu8CoTOOR"},{"type":"text","value":" term) and/or ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"yjhDWgTMBI"},{"type":"emphasis","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"time-homogeneous","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"DoKgz4JLY4"}],"key":"uaMyNLbvuh"},{"type":"text","value":" (the same function ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"EhhibhFL6E"},{"type":"inlineMath","value":"f","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"fff","key":"l0i2tW9TNq"},{"type":"text","value":"\nacross timesteps).","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Zbfzet9hDD"}],"key":"qZUIF1mCdH"}],"key":"Z3dNFJVm3H"},{"type":"listItem","spread":true,"position":{"start":{"line":103,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":103,"column":1},"end":{"line":108,"column":1}},"children":[{"type":"text","value":"Instead of maximizing the reward function, we seek to minimize 
the\n","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"fYO9X9tiYB"},{"type":"strong","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"cost function","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"senLEoaSeW"}],"key":"IWaduggdcM"},{"type":"text","value":" ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"qteLIo7pIX"},{"type":"inlineMath","value":"c_\\hi: \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"ch:S×ARc_\\hi: \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}ch:S×AR","key":"grb0W1gLAH"},{"type":"text","value":". Often, the cost\nfunction describes ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"FOqqmQoYzy"},{"type":"emphasis","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"how far away","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"SrLxNzSZuG"}],"key":"Kvpkm7cZ1T"},{"type":"text","value":" we are from a ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"UeqiowV5AE"},{"type":"strong","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"target\nstate-action pair","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"BQxb1cTjPe"}],"key":"kUtf1v1zu3"},{"type":"text","value":" ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"yfB1lRmq3r"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"ihcXkGoYkC"},{"type":"text","value":". 
An important special\ncase is when the cost is ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"UP9UeIxjz3"},{"type":"emphasis","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"time-homogeneous","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"wzKe8eHyTs"}],"key":"iZZwtZnjX0"},{"type":"text","value":"; that is, it remains the\nsame function ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"ZL6j7E4BdP"},{"type":"inlineMath","value":"c","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"ccc","key":"IDB7mEIjI3"},{"type":"text","value":" at each timestep ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"Tl8z46rNZt"},{"type":"inlineMath","value":"h","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"hhh","key":"kTA3hNnkqO"},{"type":"text","value":".","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"P96yiyhxG1"}],"key":"mOwItKWgK6"}],"key":"TcS2POaa80"},{"type":"listItem","spread":true,"position":{"start":{"line":110,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":110,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"We seek to minimize the ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"hICsKMSkdL"},{"type":"emphasis","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"undiscounted","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"oPlMjYUkG9"}],"key":"SPucjwAShE"},{"type":"text","value":" cost within a ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"maQ6I50cIx"},{"type":"emphasis","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"finite time\nhorizon","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"RcwkQQ7vw9"}],"key":"hBj1F25pPr"},{"type":"text","value":" ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"yguLLXPQ9H"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"H\\horH","key":"AyxGkhzSO2"},{"type":"text","value":". 
Note that we end an episode at the final state\n","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"HbaNCwB6ZW"},{"type":"inlineMath","value":"\\st_\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"xH\\st_\\horxH","key":"Jik6AFOKNy"},{"type":"text","value":" -- there is no ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"SxyIXvkssS"},{"type":"inlineMath","value":"\\act_\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"uH\\act_\\horuH","key":"hhCJ6c1xzl"},{"type":"text","value":", and so we denote the cost for\nthe final state as ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"GPMwTE49Vy"},{"type":"inlineMath","value":"c_\\hor(\\st_\\hor)","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"cH(xH)c_\\hor(\\st_\\hor)cH(xH)","key":"wF30bdjyEb"},{"type":"text","value":".","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"TSnQnV3JRg"}],"key":"Gqeb21lIap"}],"key":"OSN2mTs6pd"}],"key":"cFxKWlz9zx"},{"type":"paragraph","position":{"start":{"line":115,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"With all of these components, we can now formulate the ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"FTU3CjErmE"},{"type":"strong","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"optimal control\nproblem:","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"lm98kXdC2k"}],"key":"fzdbBOFRRB"},{"type":"text","value":" ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"nUSie1cdWu"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"compute a policy to minimize the expected undiscounted cost\nover ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"zzijsqOlCI"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"H\\horH","key":"w8K6Mm2yO8"},{"type":"text","value":" timesteps.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"ys4fGS4pHh"}],"key":"J6b7FQdsrG"},{"type":"text","value":" In this chapter, we will only consider\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"kDgWXVxoAe"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"deterministic, time-dependent","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"eQI8snCHk8"}],"key":"jBU3iYPpJ0"},{"type":"text","value":" policies\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"AIGvmQEYyE"},{"type":"inlineMath","value":"\\pi = (\\pi_0, \\dots, \\pi_{H-1})","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"π=(π0,,πH1)\\pi = (\\pi_0, \\dots, \\pi_{H-1})π=(π0,,πH1)","key":"UiGhdmWpa0"},{"type":"text","value":" where ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"tjFurioxu5"},{"type":"inlineMath","value":"\\pi_h : \\mathcal{S} \\to \\mathcal{A}","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"πh:SA\\pi_h : \\mathcal{S} \\to 
\\mathcal{A}πh:SA","key":"w51B7TR6P9"},{"type":"text","value":" for each\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"e6W9NJ322M"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"VQH0OzpjH2"},{"type":"text","value":".","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"RAHdMdGSNF"}],"key":"MbZLAQDIsi"},{"type":"proof","kind":"definition","label":"optimal_control","identifier":"optimal_control","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"General optimal control problem","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"f2uyoZWdz9"}],"key":"T99dRr1I4z"},{"type":"math","value":"\\begin{split}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[\n \\left( \\sum_{\\hi=0}^{\\hor-1} c_\\hi(\\st_\\hi, \\act_\\hi) \\right) + c_\\hor(\\st_\\hor)\n \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi), \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & w_\\hi \\sim \\text{noise}\n\\end{split}","position":{"start":{"line":125,"column":1},"end":{"line":135,"column":1}},"html":"minπ0,,πH1:SAE[(h=0H1ch(xh,uh))+cH(xH)]wherexh+1=fh(xh,uh,wh),uh=πh(xh)x0μ0whnoise\\begin{split}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[\n \\left( \\sum_{\\hi=0}^{\\hor-1} c_\\hi(\\st_\\hi, \\act_\\hi) \\right) + c_\\hor(\\st_\\hor)\n \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi), \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & w_\\hi \\sim \\text{noise}\n\\end{split}π0,,πH1:SAminwhereE[(h=0H1ch(xh,uh))+cH(xH)]xh+1=fh(xh,uh,wh),uh=πh(xh)x0μ0whnoise","enumerator":"2.1","key":"fwaJN6M0Ap"}],"enumerator":"2.1","html_id":"optimal-control","key":"jkIEaEnnua"},{"type":"heading","depth":3,"position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"A first attempt: Discretization","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"zAH5nfKROq"}],"identifier":"a-first-attempt-discretization","label":"A first attempt: Discretization","html_id":"a-first-attempt-discretization","implicit":true,"enumerator":"2.2.1","key":"bSpEdoVdW5"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":143,"column":1}},"children":[{"type":"text","value":"Can we solve this problem using tools from the finite MDP setting? 
If\n","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"MPb8Gy0VhU"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"S\\mathcal{S}S","key":"seOEkies5C"},{"type":"text","value":" and ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"Y3EaEvzUkk"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"A\\mathcal{A}A","key":"bilqh6aN4k"},{"type":"text","value":" were finite, then we’d be able to work backwards using the DP algorithm for computing the optimal policy in an MDP (","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"vImOhQHX0l"},{"type":"crossReference","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"children":[{"type":"text","value":"Definition ","key":"L61uZva0LB"},{"type":"text","value":"1.11","key":"yqqfS8SGel"}],"identifier":"pi_star_dp","label":"pi_star_dp","kind":"proof:definition","template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"LKqXtjKqls"},{"type":"text","value":").\nThis inspires us to try ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"aeQB799d9P"},{"type":"emphasis","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"children":[{"type":"text","value":"discretizing","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"gRP3BiiKhF"}],"key":"ML5dCIlQbn"},{"type":"text","value":" the\nproblem.","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"fznM4mQAGS"}],"key":"jySCqElvfb"},{"type":"paragraph","position":{"start":{"line":145,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Suppose ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"PJKW4l7xja"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"S\\mathcal{S}S","key":"yvqkPhtBV5"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"JkDS6MCIrl"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"A\\mathcal{A}A","key":"oeTOl2xrhc"},{"type":"text","value":" are bounded, that is,\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"VW6OugSKtq"},{"type":"inlineMath","value":"\\max_{\\st \\in \\mathcal{S}} \\|\\st\\| \\le B_\\st","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"maxxSxBx\\max_{\\st \\in \\mathcal{S}} \\|\\st\\| \\le B_\\stmaxxSxBx","key":"frWCANOAju"},{"type":"text","value":" and\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"tZGxKDeFTx"},{"type":"inlineMath","value":"\\max_{\\act \\in \\mathcal{A}} \\|\\act\\| \\le B_\\act","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"maxuAuBu\\max_{\\act \\in \\mathcal{A}} \\|\\act\\| \\le B_\\actmaxuAuBu","key":"BRhTHgRnm2"},{"type":"text","value":". 
To make ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"be6m1Vvuks"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"S\\mathcal{S}S","key":"d9wPE6v3gc"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"J6r2KzPvfd"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"A\\mathcal{A}A","key":"VGyMVauEw7"},{"type":"text","value":" finite,\nlet’s choose some small positive ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"YGL8g5gyfL"},{"type":"text","value":"ε","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"dpiUhTSXSV"},{"type":"text","value":", and simply round each\ncoordinate to the nearest multiple of ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"oh4HKsDOB7"},{"type":"text","value":"ε","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"qLaSkQpV6n"},{"type":"text","value":". For example, if\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"RjyMrBgsj4"},{"type":"inlineMath","value":"\\epsilon = 0.01","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"ϵ=0.01\\epsilon = 0.01ϵ=0.01","key":"TbwK1hNt2i"},{"type":"text","value":", then we round each element of ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"sMnOccKvdM"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"x\\stx","key":"WZBKtebcSE"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"i96ax4MFcJ"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"u\\actu","key":"zV9JsTEgNC"},{"type":"text","value":" to two\ndecimal spaces.","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"J7w1FQFPyX"}],"key":"YoaiSEq1TW"},{"type":"paragraph","position":{"start":{"line":153,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"However, the discretized ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"wR8X5YUn2J"},{"type":"inlineMath","value":"\\widetilde{\\mathcal{S}}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~\\widetilde{\\mathcal{S}}S","key":"eyi6FUTd1z"},{"type":"text","value":" and ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"tEg9qjDvcM"},{"type":"inlineMath","value":"\\widetilde{\\mathcal{A}}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"A~\\widetilde{\\mathcal{A}}A","key":"gb7nrFHqPO"},{"type":"text","value":" may be finite, but\nthey may be infeasibly large: we must divide ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"hxobLFJSDD"},{"type":"emphasis","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"each dimension","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"ypPD3auF1s"}],"key":"uJ3aAEY0Dj"},{"type":"text","value":" into\nintervals of length 
","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"RSDi5iY3h3"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"ε\\varepsilonε","key":"ugHxcVnLh3"},{"type":"text","value":", resulting in\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"tl2CuYxS6c"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{S}}| = (B_\\st/\\varepsilon)^{n_\\st}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~=(Bx/ε)nx|\\widetilde{\\mathcal{S}}| = (B_\\st/\\varepsilon)^{n_\\st}S=(Bx/ε)nx","key":"wT0MqvyENc"},{"type":"text","value":" and\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"kAdxXyuyZn"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{A}}| = (B_\\act/\\varepsilon)^{n_\\act}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"A~=(Bu/ε)nu|\\widetilde{\\mathcal{A}}| = (B_\\act/\\varepsilon)^{n_\\act}A=(Bu/ε)nu","key":"VfW3NZGFF3"},{"type":"text","value":". To get a sense of how\nquickly this grows, consider ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"Ww66sylDL8"},{"type":"inlineMath","value":"\\varepsilon = 0.01, n_\\st = n_\\act = 10","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"ε=0.01,nx=nu=10\\varepsilon = 0.01, n_\\st = n_\\act = 10ε=0.01,nx=nu=10","key":"dtVWwXloY5"},{"type":"text","value":".\nThen the number of elements in the transition matrix would be\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"pQhRtzO4JB"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{S}}|^2 |\\widetilde{\\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~2A~=(10010)2(10010)=1060|\\widetilde{\\mathcal{S}}|^2 |\\widetilde{\\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}S2A=(10010)2(10010)=1060","key":"aYiv6Aog5j"},{"type":"text","value":"! (That’s\na trillion trillion trillion trillion trillion.)","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"MWXrE0DPMX"}],"key":"PWqaYRhwAA"},{"type":"paragraph","position":{"start":{"line":163,"column":1},"end":{"line":169,"column":1}},"children":[{"type":"text","value":"What properties of the problem could we instead make use of? Note that\nby discretizing the state and action spaces, we implicitly assumed that\nrounding each state or action vector by some tiny amount ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"pWOK1LcDGc"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"html":"ε\\varepsilonε","key":"EXpJyEl6D0"},{"type":"text","value":"\nwouldn’t change the behavior of the system by much; namely, that the\ncost and dynamics were relatively ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"fKtfhMOavH"},{"type":"emphasis","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"RfWHLiMWXq"}],"key":"u4x7BvaSzl"},{"type":"text","value":". Can we use this\ncontinuous structure in other ways? 
This leads us to the ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"Ma0zTVgr1j"},{"type":"strong","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"linear\nquadratic regulator","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"k4vomIm67a"}],"key":"v1UZDH8dtG"},{"type":"text","value":".","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"qCZrSJojNn"}],"key":"A2NzujTiLO"},{"type":"heading","depth":2,"position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"children":[{"type":"text","value":"The Linear Quadratic Regulator","position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"key":"JXEkabJYua"}],"label":"lqr","identifier":"lqr","html_id":"lqr","enumerator":"2.3","key":"IS2T3hx6yE"},{"type":"paragraph","position":{"start":{"line":174,"column":1},"end":{"line":175,"column":1}},"children":[{"type":"text","value":"The optimal control problem ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"ZMnHGV633A"},{"type":"crossReference","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"Definition ","key":"T6Ivgnpcv3"},{"type":"text","value":"2.1","key":"yDkHWL1PiG"}],"identifier":"optimal_control","label":"optimal_control","kind":"proof:definition","template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"bPyg4RdtDU"},{"type":"text","value":" seems highly complex in general. Is there a relevant simplification that we can analyze?\nThe ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"QaN40TSD5P"},{"type":"strong","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"Xe7HoHxuqN"}],"key":"ERl8a6Tzsy"},{"type":"text","value":" (LQR) is a solvable case and a fundamental tool in control theory.","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"QuwogqDpof"}],"key":"GctgDz8Uhq"},{"type":"proof","kind":"definition","label":"lqr_definition","identifier":"lqr_definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The linear quadratic regulator","position":{"start":{"line":177,"column":1},"end":{"line":177,"column":1}},"key":"ZOhjinbHZY"}],"key":"AAAh2bncY7"},{"type":"paragraph","position":{"start":{"line":180,"column":1},"end":{"line":181,"column":1}},"children":[{"type":"text","value":"The LQR problem is a special case of the ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"fLc9YTOOWb"},{"type":"crossReference","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"General optimal control problem","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"x3agUhaXQf"}],"identifier":"optimal_control","label":"optimal_control","kind":"proof:definition","template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"IqfoEopYck"},{"type":"text","value":" with 
","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"rx8Jk2TJpd"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"linear dynamics","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"jaF1PEbLYT"}],"key":"Ur5K5nDSan"},{"type":"text","value":" and an ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"z0zMP3KVlo"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"upward-curved quadratic cost function","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"CXplrGTKcz"}],"key":"pyGqHeCJCt"},{"type":"text","value":".\nSolving the LQR problem will additionally enable us to ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"SVgeLtyXqD"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"locally approximate","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"BHgLdtEtJJ"}],"key":"UQwhG8PJjy"},{"type":"text","value":" more complex setups using ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"aGmbugqIYE"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"Taylor approximations","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"rvn80FMtCe"}],"key":"dG5uddVd8W"},{"type":"text","value":".","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"TetFERvD6A"}],"key":"yZAcqqtCnG"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"strong","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"Linear, time-homogeneous dynamics","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"R8fdSgblyc"}],"key":"CEp6koHJ6v"},{"type":"text","value":": for each timestep ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"qjwV3okNOK"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"RBexxdOdBU"},{"type":"text","value":",","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"kq3iK8bDiM"}],"key":"NOXhFrrA9b"},{"type":"math","value":"\\begin{aligned}\n \\st_{\\hi+1} &= f(\\st_\\hi, \\act_\\hi, w_\\hi) = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n \\text{where } w_\\hi &\\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}","position":{"start":{"line":185,"column":1},"end":{"line":190,"column":1}},"html":"xh+1=f(xh,uh,wh)=Axh+Buh+whwhere whN(0,σ2I).\\begin{aligned}\n \\st_{\\hi+1} &= f(\\st_\\hi, \\act_\\hi, w_\\hi) = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n \\text{where } w_\\hi &\\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}xh+1where wh=f(xh,uh,wh)=Axh+Buh+whN(0,σ2I).","enumerator":"2.2","key":"XXz1piC84x"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"Here, 
","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"uiSL0EcD6h"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"html":"whw_\\hiwh","key":"PH1aPUtEH6"},{"type":"text","value":" is a spherical Gaussian ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"dhCwdatJPP"},{"type":"strong","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"noise term","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"UzK1t1fgpc"}],"key":"aBEbVP1ESp"},{"type":"text","value":" that makes the dynamics random.\nSetting ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"eCzvob2U7J"},{"type":"inlineMath","value":"\\sigma = 0","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"html":"σ=0\\sigma = 0σ=0","key":"kzYB4TUAn2"},{"type":"text","value":" gives us ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"l4fp6b4gRV"},{"type":"strong","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"jfjKBIKQlp"}],"key":"QhEDKK7JqH"},{"type":"text","value":" state transitions.\nWe will find that the optimal policy actually ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"vSayJS5gaR"},{"type":"emphasis","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"does not depend on the noise","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"AHb9qrhVYU"}],"key":"yRycskUoyu"},{"type":"text","value":", although the optimal value function and Q-function do.","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"wO61HEHjnh"}],"key":"pdf7X4Dqa7"},{"type":"paragraph","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"children":[{"type":"strong","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"children":[{"type":"text","value":"Upward-curved quadratic, time-homogeneous cost function","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"Eg7W18bxAx"}],"key":"cRjn0tjgGJ"},{"type":"text","value":":","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"kOsp1JiQmM"}],"key":"FQnsoXcMWI"},{"type":"math","value":"c(\\st_\\hi, \\act_\\hi) = \\begin{cases}\n \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi & \\hi < \\hor \\\\\n \\st_\\hi^\\top Q \\st_\\hi & \\hi = \\hor\n\\end{cases}.","position":{"start":{"line":198,"column":1},"end":{"line":203,"column":1}},"html":"c(xh,uh)={xhQxh+uhRuhh<HxhQxhh=H.c(\\st_\\hi, \\act_\\hi) = \\begin{cases}\n \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi & \\hi < \\hor \\\\\n \\st_\\hi^\\top Q \\st_\\hi & \\hi = \\hor\n\\end{cases}.c(xh,uh)={xhQxh+uhRuhxhQxhh<Hh=H.","enumerator":"2.3","key":"JgOeuETMhm"},{"type":"paragraph","position":{"start":{"line":205,"column":1},"end":{"line":207,"column":1}},"children":[{"type":"text","value":"This cost function attempts to stabilize the state and action about ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"X2SYIbe8bP"},{"type":"inlineMath","value":"(s^\\star, a^\\star) = (0, 
0)","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"(s,a)=(0,0)(s^\\star, a^\\star) = (0, 0)(s,a)=(0,0)","key":"JsaSplFx8c"},{"type":"text","value":".\nWe require ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"JshBrQtGiM"},{"type":"inlineMath","value":"Q \\in \\R^{n_\\st \\times n_\\st}","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"QRnx×nxQ \\in \\R^{n_\\st \\times n_\\st}QRnx×nx","key":"BKVsiTB5sc"},{"type":"text","value":" and ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"LAVSDwx1R9"},{"type":"inlineMath","value":"R \\in \\R^{n_\\act \\times n_\\act}","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"RRnu×nuR \\in \\R^{n_\\act \\times n_\\act}RRnu×nu","key":"luEFm2tRUo"},{"type":"text","value":" to both be ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"IrjN1X7NyS"},{"type":"emphasis","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"positive definite","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"eBBlTlQtmn"}],"key":"DTX5lrsaQZ"},{"type":"text","value":" matrices so that ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"ljZrilSB18"},{"type":"inlineMath","value":"c","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"ccc","key":"lwSyDYJ2Pw"},{"type":"text","value":" has a well-defined unique minimum.\nWe can furthermore assume without loss of generality that they are both ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"wReUlWoGrA"},{"type":"emphasis","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"symmetric","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"dwZnOXjGiv"}],"key":"mR9r8ju3Z3"},{"type":"text","value":" (see exercise below).","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"aTuomLE6CR"}],"key":"yuAw3noYIZ"},{"type":"paragraph","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"This results in the LQR optimization problem:","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"key":"YkDlq3jkji"}],"key":"skH2DYMPbk"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}","position":{"start":{"line":211,"column":1},"end":{"line":219,"column":1}},"html":"minπ0,,πH1:SAE[(h=0H1xhQxh+uhRuh)+xHQxH]wherexh+1=Axh+Buh+whuh=πh(xh)whN(0,σ2I)x0μ0.\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & 
w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}π0,,πH1:SAminwhereE[(h=0H1xhQxh+uhRuh)+xHQxH]xh+1=Axh+Buh+whuh=πh(xh)whN(0,σ2I)x0μ0.","enumerator":"2.4","key":"RRzavaYicJ"}],"enumerator":"2.2","html_id":"lqr-definition","key":"Fw54VGC4e2"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"HqBKv6nNCp"}],"key":"tUFhNDFfJ7"},{"type":"paragraph","position":{"start":{"line":223,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"text","value":"Here we’ll show that we don’t lose generality by assuming that ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"rIAQ4ptvv8"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"iqAaQRpoVZ"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"IzQAJhR8ER"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"JCgBTSwsLk"},{"type":"text","value":" are symmetric.\nShow that replacing ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"S1EM3oQFW4"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"tr6WuyQ17u"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"YvKtoAG505"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"dnze3IPl5Y"},{"type":"text","value":" with ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"AQBF7vmjjn"},{"type":"inlineMath","value":"(Q + Q^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(Q+Q)/2(Q + Q^\\top) / 2(Q+Q)/2","key":"ItH1FiI3x6"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"f2s2bCkitB"},{"type":"inlineMath","value":"(R + R^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(R+R)/2(R + R^\\top) / 2(R+R)/2","key":"T2qUisiMcc"},{"type":"text","value":" (which are symmetric) yields the same cost function.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"bJ2BEH3rAX"}],"key":"nULxYNK56q"}],"key":"v2N5G1ey91"},{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"text","value":"We will henceforth abbreviate “symmetric positive definite” as s.p.d.\nand “positive definite” as p.d.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"SpPa9VXUTv"}],"key":"YoKtbfABip"},{"type":"paragraph","position":{"start":{"line":230,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"It will be helpful to reintroduce the ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"AJjTbw6rYR"},{"type":"emphasis","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"KWMYuKfjA2"}],"key":"ZoqQxhYzEs"},{"type":"text","value":" notation for a policy to 
denote the average cost it incurs.\nThese will be instrumental in constructing the optimal policy via ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"dpmEKnRbAP"},{"type":"strong","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"dynamic programming,","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"oYw0w88BUN"}],"key":"s7nqQgGtP5"},{"type":"text","value":"\nas we did in ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"ZDmm1Kq0l1"},{"type":"crossReference","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"Section ","key":"Oj0yTJGoJM"},{"type":"text","value":"1.3.2","key":"dYkyroOawv"}],"identifier":"opt_dynamic_programming","label":"opt_dynamic_programming","kind":"heading","template":"Section %s","enumerator":"1.3.2","resolved":true,"html_id":"opt-dynamic-programming","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"Ws7WwAhUgF"},{"type":"text","value":" for MDPs.","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"vS9Tl2yU1Z"}],"key":"bcdm9v9aIq"},{"type":"proof","kind":"definition","label":"value_lqr","identifier":"value_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Value functions for LQR","position":{"start":{"line":234,"column":1},"end":{"line":234,"column":1}},"key":"YTR8fxA1dr"}],"key":"IyJi4BNEU5"},{"type":"paragraph","position":{"start":{"line":237,"column":1},"end":{"line":238,"column":1}},"children":[{"type":"text","value":"Given a policy ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"olZDZ0Duql"},{"type":"inlineMath","value":"\\mathbf{\\pi} = (\\pi_0, \\dots, \\pi_{\\hor-1})","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"html":"π=(π0,,πH1)\\mathbf{\\pi} = (\\pi_0, \\dots, \\pi_{\\hor-1})π=(π0,,πH1)","key":"YYJQMYVdCU"},{"type":"text","value":",\nwe can define its value function ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"Z1FzTbRhA8"},{"type":"inlineMath","value":"V^\\pi_\\hi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"html":"Vhπ:SRV^\\pi_\\hi : \\mathcal{S} \\to \\mathbb{R}Vhπ:SR","key":"HjH2dUG2OY"},{"type":"text","value":" at time ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"qXfUEeeW3R"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"qVLZ2fa4aa"},{"type":"text","value":" as the average ","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"qJpMHIJk0u"},{"type":"strong","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"children":[{"type":"text","value":"cost-to-go","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"GgWb6YNbWc"}],"key":"SMMPzgNo0v"},{"type":"text","value":" incurred by that policy:","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"key":"n9Sp3os5od"}],"key":"gAYYkJTjZc"},{"type":"math","value":"\\begin{split}\n V^\\pi_\\hi (\\st) &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n 
&= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n\\end{split}","position":{"start":{"line":240,"column":1},"end":{"line":245,"column":1}},"html":"Vhπ(x)=E[(i=hH1c(xi,ui))+c(xH)xh=x,ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxHxh=x,ui=πi(xi)hi<H]\\begin{split}\n V^\\pi_\\hi (\\st) &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n\\end{split}Vhπ(x)=E[(i=hH1c(xi,ui))+c(xH)xh=x,ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxHxh=x,ui=πi(xi)hi<H]","enumerator":"2.5","key":"odxKEqLER6"},{"type":"paragraph","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"The Q-function additionally conditions on the first action we take:","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"OAWL8HXW9e"}],"key":"A0Oa8pUEOC"},{"type":"math","value":"\\begin{split}\n Q^\\pi_\\hi (\\st, \\act) &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":249,"column":1},"end":{"line":256,"column":1}},"html":"Qhπ(x,u)=E[(i=hH1c(xi,ui))+c(xH)(xh,uh)=(x,u),ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxH(xh,uh)=(x,u),ui=πi(xi)hi<H]\\begin{split}\n Q^\\pi_\\hi (\\st, \\act) &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}Qhπ(x,u)=E[(i=hH1c(xi,ui))+c(xH)(xh,uh)=(x,u),ui=πi(xi)hi<H]=E[(i=hH1xiQxi+uiRui)+xHQxH(xh,uh)=(x,u),ui=πi(xi)hi<H]","enumerator":"2.6","key":"nOo6bkPwmE"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"Note that since we use ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"HGqmjnsnkb"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"cost","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"D5EI5vTzfZ"}],"key":"WimLhigFzy"},{"type":"text","value":" instead of 
","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"JXVyY1Dhln"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"reward,","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"a8uI9pWd6e"}],"key":"V4AJOmi3uM"},{"type":"text","value":"\nthe best policies are the ones with ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"Duiq2r4ro0"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"smaller","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"lkLUHbpcFD"}],"key":"rHjOmTDraO"},{"type":"text","value":" values of the value function.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"Y6b37IQBeh"}],"key":"m9lQNd4xqz"}],"enumerator":"2.3","html_id":"value-lqr","key":"IfQRGFqMTK"},{"type":"heading","depth":2,"position":{"start":{"line":263,"column":1},"end":{"line":263,"column":1}},"children":[{"type":"text","value":"Optimality and the Riccati Equation","position":{"start":{"line":263,"column":1},"end":{"line":263,"column":1}},"key":"hNjtSOUr5l"}],"label":"optimal_lqr","identifier":"optimal_lqr","html_id":"optimal-lqr","enumerator":"2.4","key":"XcKGGFtSyK"},{"type":"paragraph","position":{"start":{"line":265,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"In this section,\nwe’ll compute the optimal value function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"M9xcwjth9k"},{"type":"inlineMath","value":"V^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"VhV^\\star_hVh","key":"XSs69xnTgt"},{"type":"text","value":",\nQ-function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"XSnPDZpwYu"},{"type":"inlineMath","value":"Q^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"QhQ^\\star_hQh","key":"eLCrq0izGO"},{"type":"text","value":",\nand policy ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"CVHr1bngZp"},{"type":"inlineMath","value":"\\pi^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"πh\\pi^\\star_hπh","key":"kmUcJzrAaQ"},{"type":"text","value":" in ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"fo68474r6B"},{"type":"crossReference","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"the linear quadratic regulator","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"XtD4yMuur3"}],"identifier":"lqr_definition","label":"lqr_definition","kind":"proof:definition","template":"Definition %s","enumerator":"2.2","resolved":true,"html_id":"lqr-definition","key":"UAIy08Su1x"},{"type":"text","value":" using ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"DZQLjzrmEm"},{"type":"strong","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"Mdd9UlYvNR"}],"key":"a1qBFdjM4L"},{"type":"text","value":"\nin a very similar way to the DP algorithms 
","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"fyTw9Jv6YF"},{"type":"crossReference","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"NtueF0ITZK"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"ruSfrtRL1k"},{"type":"text","value":".\nRecall the definition of the optimal value function:","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"hYIHPpxvoy"}],"key":"tCTzIELKgY"},{"type":"proof","kind":"definition","label":"optimal_value_lqr","identifier":"optimal_value_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"HCeRqpxPH5"}],"key":"PAmqLxDImy"},{"type":"paragraph","position":{"start":{"line":275,"column":1},"end":{"line":277,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"QYqcaPm3IM"},{"type":"strong","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"ibpnChbofg"}],"key":"IVK0anFOsj"},{"type":"text","value":" is the one that,\nat any time and in any state,\nachieves ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"B0yfyJs93d"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"minimum cost","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"a85E4RW2X0"}],"key":"Y5efqN5iIq"},{"type":"text","value":" across ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"GYGULcfeT1"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"all policies","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"RSo88bXJ2j"}],"key":"taqPacETsa"},{"type":"text","value":":","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"O3eXWPKoOk"}],"key":"RabuWwvo2c"},{"type":"math","value":"\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":279,"column":1},"end":{"line":285,"column":1}},"html":"Vh(x)=minπh,,πH1Vhπ(x)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) 
\\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}Vh(x)=πh,,πH1minVhπ(x)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]","enumerator":"2.7","key":"QbXC8a8FZM"},{"type":"paragraph","position":{"start":{"line":287,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"text","value":"The optimal Q-function is defined similarly,\nconditioned on the starting action as well:","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"Ff4q12N4jj"}],"key":"eKdH1hj6Ue"},{"type":"math","value":"\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":290,"column":1},"end":{"line":296,"column":1}},"html":"Qh(x,u)=minπh,,πH1Qhπ(x,u)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}Qh(x,u)=πh,,πH1minQhπ(x,u)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]","enumerator":"2.8","key":"JNaecM9MfB"},{"type":"paragraph","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"Both of the definitions above assume ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"Ydn6Qv2CQB"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"O7KTQeHCIi"}],"key":"VXFWfGibml"},{"type":"text","value":" policies. Otherwise we would have to take an ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"fURxXRI64u"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"expectation","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"iWYQapALRO"}],"key":"UF9I1WF284"},{"type":"text","value":" over actions drawn from the policy, i.e. 
","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"ucCCX4su4R"},{"type":"inlineMath","value":"\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"uhπh(xh)\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)uhπh(xh)","key":"hnVoZrEOXq"},{"type":"text","value":".","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"MmffBcIFSg"}],"key":"m7ZNpUsB6u"}],"enumerator":"2.4","html_id":"optimal-value-lqr","key":"SD4xlZ2N86"},{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"text","value":"We will prove the striking fact that the solution has very simple structure:\n","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"AxYGIrv34j"},{"type":"inlineMath","value":"V_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"VhV_h^\\starVh","key":"WYBNqz6wit"},{"type":"text","value":" and ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"ksJkfnZ8G3"},{"type":"inlineMath","value":"Q^\\star_h","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"QhQ^\\star_hQh","key":"rsCJtkjikL"},{"type":"text","value":" are ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Eq4IeVCYYM"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"upward-curved quadratics","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"U5qpO4XdN3"}],"key":"IPjbTmRURi"},{"type":"text","value":"\nand ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"pjEXCIz0wX"},{"type":"inlineMath","value":"\\pi_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"πh\\pi_h^\\starπh","key":"FLqnJzStti"},{"type":"text","value":" is ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"hg2LXnDJpP"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"QQ9MgZToTd"}],"key":"lyVxwjuTVg"},{"type":"text","value":" and furthermore does not depend on the noise!","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"hrkIRrAk8C"}],"key":"D8kqrG9qZ0"},{"type":"proof","kind":"theorem","label":"optimal_value_lqr_quadratic","identifier":"optimal_value_lqr_quadratic","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR is an upward-curved quadratic","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"lgojp81Q94"}],"key":"eBNBSD5X8G"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"owMA72YkZb"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"fA2jXd6Rtw"},{"type":"text","value":",","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"mi4e7BM06Z"}],"key":"kT8u4HIZzC"},{"type":"math","value":"V^\\star_\\hi(\\st) = 
\\st^\\top P_\\hi \\st + p_\\hi","position":{"start":{"line":310,"column":1},"end":{"line":312,"column":1}},"html":"Vh(x)=xPhx+phV^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hiVh(x)=xPhx+ph","enumerator":"2.9","key":"kBxvbV5iSN"},{"type":"paragraph","position":{"start":{"line":314,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"text","value":"for some s.p.d. matrix ","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"VeHZiX4RC8"},{"type":"inlineMath","value":"P_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"PhRnx×nxP_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}PhRnx×nx","key":"q5PnZoOS6r"},{"type":"text","value":" and scalar\n","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"Y6kKwsnAgZ"},{"type":"inlineMath","value":"p_\\hi \\in \\mathbb{R}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"phRp_\\hi \\in \\mathbb{R}phR","key":"c3VqVOhveH"},{"type":"text","value":".","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"EbYyDnvjwt"}],"key":"uMgXoUF1Ft"}],"enumerator":"2.1","html_id":"optimal-value-lqr-quadratic","key":"hDFLlKpCsa"},{"type":"proof","kind":"theorem","label":"optimal_policy_lqr_linear","identifier":"optimal_policy_lqr_linear","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal policy in LQR is linear","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"Fy35NvSlXp"}],"key":"pnpo9A1Okj"},{"type":"paragraph","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"s8QBobXyWQ"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"uaUvIljcXq"},{"type":"text","value":",","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"I4exMsqm19"}],"key":"frHWyWrwko"},{"type":"math","value":"\\pi^\\star_\\hi (\\st) = - K_\\hi \\st","position":{"start":{"line":323,"column":1},"end":{"line":325,"column":1}},"html":"πh(x)=Khx\\pi^\\star_\\hi (\\st) = - K_\\hi \\stπh(x)=Khx","enumerator":"2.10","key":"ELU7HnRlKm"},{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":328,"column":1}},"children":[{"type":"text","value":"for some ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"RzzVcvAaZs"},{"type":"inlineMath","value":"K_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"KhRnu×nxK_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}KhRnu×nx","key":"XkaXdz2BqU"},{"type":"text","value":".\n(The negative is due to convention.)","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"c1MBlYwJhG"}],"key":"ljsF8iYBbl"}],"enumerator":"2.2","html_id":"optimal-policy-lqr-linear","key":"P9a3MB7UPj"},{"type":"paragraph","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"The construction (and inductive proof) proceeds similarly to the one 
","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"dvVHlpG2VG"},{"type":"crossReference","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"QGOtLnDdgx"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"UkxSGy2mel"},{"type":"text","value":".","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"BP4xTyDewg"}],"key":"lBizK24rHx"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":333,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"We’ll compute ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"yR7lfkd6bD"},{"type":"inlineMath","value":"V_\\hor^\\star","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"VHV_\\hor^\\starVH","key":"huf0AbbYnl"},{"type":"text","value":" (at the end of the horizon) as our base case.","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"zYI6UqR5CL"}],"key":"ufATJfx6nY"},{"type":"listItem","spread":true,"position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Then we’ll work step-by-step backwards in time, using ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"kCkRI6QJhm"},{"type":"inlineMath","value":"V_{\\hi+1}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"Vh+1V_{\\hi+1}^\\starVh+1","key":"vb3W2ubPNU"},{"type":"text","value":" to compute ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"oQFJRh470O"},{"type":"inlineMath","value":"Q_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"QhQ_\\hi^\\starQh","key":"vv7YZ9la9i"},{"type":"text","value":", ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"a39LY1N4QF"},{"type":"inlineMath","value":"\\pi_{\\hi}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"πh\\pi_{\\hi}^\\starπh","key":"if9FPtSbYT"},{"type":"text","value":", and ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"tv4iQAacm3"},{"type":"inlineMath","value":"V_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"VhV_\\hi^\\starVh","key":"WKiY7pJOls"},{"type":"text","value":".","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"oKCyPiiIqc"}],"key":"I80ixU2VYw"}],"key":"htyk7BB11D"},{"type":"comment","value":" TODO insert reference for proof by induction ","key":"kJ1pUIzztd"},{"type":"paragraph","position":{"start":{"line":338,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"strong","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"children":[{"type":"text","value":"Base case:","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"rEQPN434Am"}],"key":"rvqAgCkh7C"},{"type":"text","value":"\nAt the final timestep,\nthere are no possible actions to take,\nand so 
","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"GIapAtTE1T"},{"type":"inlineMath","value":"V^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\st","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=c(x)=xQxV^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\stVH(x)=c(x)=xQx","key":"AxXwrYjT3g"},{"type":"text","value":".\nThus ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"LfNmdjaYCd"},{"type":"inlineMath","value":"V_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\hor","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=xPHx+pHV_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\horVH(x)=xPHx+pH","key":"Erouwc52zf"},{"type":"text","value":"\nwhere ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"baSItI5HrZ"},{"type":"inlineMath","value":"P_\\hor = Q","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"PH=QP_\\hor = QPH=Q","key":"pwc1hSuLdt"},{"type":"text","value":" and ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"FhKtjCB42l"},{"type":"inlineMath","value":"p_\\hor = 0","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"pH=0p_\\hor = 0pH=0","key":"p2b53Qvbww"},{"type":"text","value":".","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"QFRjvcjqzd"}],"key":"Xf5rufKX5d"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"strong","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"Inductive hypothesis:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"oC1dCZzh9E"}],"key":"qvC9Ct6E5T"},{"type":"text","value":"\nWe seek to show that the inductive step holds for both theorems:\nIf ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"YEYbK4YKRH"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh+1(x)V^\\star_{\\hi+1}(\\st)Vh+1(x)","key":"idniun1nsS"},{"type":"text","value":" is an upward-curved quadratic,\nthen ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"yCjyTFm5bI"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"HNGLcASCWp"},{"type":"text","value":" must also be an upward-curved quadratic,\nand ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"LrOyFlQoGo"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"πh(x)\\pi^\\star_\\hi(\\st)πh(x)","key":"LRHjntc5xk"},{"type":"text","value":" must be linear.\nWe’ll break this down into the following steps:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"AV0Tasae2U"}],"key":"fIgXVgS2zm"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":352,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":352,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"Show that 
","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"MV2oIgFoCL"},{"type":"inlineMath","value":"Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"Qh(x,u)Q^\\star_\\hi(\\st, \\act)Qh(x,u)","key":"p6FePCXCMz"},{"type":"text","value":" is an upward-curved quadratic (in both\n","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"wjZzXtAIlo"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"x\\stx","key":"TwFxijS1PR"},{"type":"text","value":" and ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"fyknaOcltA"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"u\\actu","key":"riCbndbNzP"},{"type":"text","value":").","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"gs3roYBEL8"}],"key":"UpUBFt3uGB"},{"type":"listItem","spread":true,"position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"text","value":"Derive the optimal policy\n","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"tlZZixboZK"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"html":"πh(x)=argminuQh(x,u)\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)πh(x)=argminuQh(x,u)","key":"dBWSNQsx3g"},{"type":"text","value":" and show\nthat it’s linear.","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"QXP4QbNpCM"}],"key":"JgPNbMLT1C"},{"type":"listItem","spread":true,"position":{"start":{"line":357,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"Show that ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"qZ4Cb8PylT"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"pBmoQ4bEvf"},{"type":"text","value":" is an upward-curved quadratic.","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"iNpQwOYy4t"}],"key":"LzDVyVAiZ8"}],"key":"gEpjt3MN6g"},{"type":"paragraph","position":{"start":{"line":359,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"text","value":"We first assume the inductive hypothesis that our theorems are true at\ntime ","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"rmUTob4eiR"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"html":"h+1\\hi+1h+1","key":"DuQaeRlisn"},{"type":"text","value":". 
That is,","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"ugSDUZpB3g"}],"key":"ApaBiYiws3"},{"type":"math","value":"V^\\star_{\\hi+1}(\\st) = \\st^\\top P_{\\hi+1} \\st + p_{\\hi+1} \\quad \\forall \\st \\in \\mathcal{S}.","position":{"start":{"line":362,"column":1},"end":{"line":364,"column":1}},"html":"Vh+1(x)=xPh+1x+ph+1xS.V^\\star_{\\hi+1}(\\st) = \\st^\\top P_{\\hi+1} \\st + p_{\\hi+1} \\quad \\forall \\st \\in \\mathcal{S}.Vh+1(x)=xPh+1x+ph+1xS.","enumerator":"2.11","key":"YrpRP1clTh"},{"type":"proof","kind":"lemma","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"inlineMath","value":"Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":366,"column":1},"end":{"line":366,"column":1}},"html":"Qh(x,u)Q^\\star_\\hi(\\st, \\act)Qh(x,u)","key":"nAbkUl1FEK"},{"type":"text","value":" is an upward-curved quadratic","position":{"start":{"line":366,"column":1},"end":{"line":366,"column":1}},"key":"RWPxtjUghW"}],"key":"DQUDzjddgg"},{"type":"paragraph","position":{"start":{"line":367,"column":1},"end":{"line":368,"column":1}},"children":[{"type":"text","value":"Let us decompose ","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"key":"uiDkZ1eFdW"},{"type":"inlineMath","value":"Q^\\star_\\hi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"html":"Qh:S×ARQ^\\star_\\hi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}Qh:S×AR","key":"ijz0qUmpLa"},{"type":"text","value":"\ninto the immediate reward plus the expected cost-to-go:","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"key":"PoAyT3WTpl"}],"key":"lTUJf7suH3"},{"type":"math","value":"Q^\\star_\\hi(\\st, \\act) = c(\\st, \\act) + \\E_{\\st' \\sim f(\\st, \\act, w_{\\hi+1})} [V^\\star_{\\hi+1}(\\st')].","position":{"start":{"line":370,"column":1},"end":{"line":372,"column":1}},"html":"Qh(x,u)=c(x,u)+Exf(x,u,wh+1)[Vh+1(x)].Q^\\star_\\hi(\\st, \\act) = c(\\st, \\act) + \\E_{\\st' \\sim f(\\st, \\act, w_{\\hi+1})} [V^\\star_{\\hi+1}(\\st')].Qh(x,u)=c(x,u)+Exf(x,u,wh+1)[Vh+1(x)].","enumerator":"2.12","key":"XEtK3z7bIw"},{"type":"paragraph","position":{"start":{"line":374,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"Recall ","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"Jv6Ka1lhY9"},{"type":"inlineMath","value":"c(\\st, \\act) := \\st^\\top Q \\st + \\act^\\top R \\act","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"html":"c(x,u):=xQx+uRuc(\\st, \\act) := \\st^\\top Q \\st + \\act^\\top R \\actc(x,u):=xQx+uRu","key":"SmfDOPKVtN"},{"type":"text","value":".\nLet’s consider the expectation over the next timestep.\nThe only randomness in the dynamics comes from the noise\n","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"yyMrqqASyp"},{"type":"inlineMath","value":"w_{\\hi+1} \\sim \\mathcal{N}(0, \\sigma^2 I)","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"html":"wh+1N(0,σ2I)w_{\\hi+1} \\sim \\mathcal{N}(0, \\sigma^2 I)wh+1N(0,σ2I)","key":"LfqPFMttzN"},{"type":"text","value":",\nso we can expand the expectation as:","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"ysdl42kWpd"}],"key":"euBhLKrybA"},{"type":"math","value":"\\begin{aligned}\n & \\E_{\\st'} [V^\\star_{\\hi+1}(\\st')] \\\\\n {} = {} & \\E_{w_{\\hi+1}} [V^\\star_{\\hi+1}(A \\st + B \\act + w_{\\hi+1})] & & 
\\text{definition of } f \\\\\n {} = {} & \\E_{w_{\\hi+1}} [ (A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1} ]. & & \\text{inductive hypothesis}\n\\end{aligned}","position":{"start":{"line":380,"column":1},"end":{"line":386,"column":1}},"html":"Ex[Vh+1(x)]=Ewh+1[Vh+1(Ax+Bu+wh+1)]definition of f=Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1].inductive hypothesis\\begin{aligned}\n & \\E_{\\st'} [V^\\star_{\\hi+1}(\\st')] \\\\\n {} = {} & \\E_{w_{\\hi+1}} [V^\\star_{\\hi+1}(A \\st + B \\act + w_{\\hi+1})] & & \\text{definition of } f \\\\\n {} = {} & \\E_{w_{\\hi+1}} [ (A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1} ]. & & \\text{inductive hypothesis}\n\\end{aligned}==Ex[Vh+1(x)]Ewh+1[Vh+1(Ax+Bu+wh+1)]Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1].definition of finductive hypothesis","enumerator":"2.13","key":"F2DCnK20jo"},{"type":"paragraph","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"children":[{"type":"text","value":"Summing and combining like terms, we get","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"QjteoNdT6o"}],"key":"Htj8MlcgWk"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top Q \\st + \\act^\\top R \\act + \\E_{w_{\\hi+1}} [(A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1}] \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A)\\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] + p_{\\hi+1}.\n\\end{aligned}","position":{"start":{"line":390,"column":1},"end":{"line":396,"column":1}},"html":"Qh(x,u)=xQx+uRu+Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1]=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+Ewh+1[wh+1Ph+1wh+1]+ph+1.\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top Q \\st + \\act^\\top R \\act + \\E_{w_{\\hi+1}} [(A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1}] \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A)\\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] + p_{\\hi+1}.\n\\end{aligned}Qh(x,u)=xQx+uRu+Ewh+1[(Ax+Bu+wh+1)Ph+1(Ax+Bu+wh+1)+ph+1]=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+Ewh+1[wh+1Ph+1wh+1]+ph+1.","enumerator":"2.14","key":"o4PvqoTGcD"},{"type":"paragraph","position":{"start":{"line":398,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"text","value":"Note that the terms that are linear in ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"i0fsCO2gAR"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"whw_\\hiwh","key":"OJI5TqjtXD"},{"type":"text","value":" have mean\nzero and vanish. 
Now consider the remaining expectation over the noise.\nBy expanding out the product and using linearity of expectation, we can\nwrite this out as","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"FuDY7NzARw"}],"key":"zH7Jt3L7XO"},{"type":"math","value":"\\begin{aligned}\n \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] & = \\sum_{i=1}^d \\sum_{j=1}^d (P_{\\hi+1})_{ij} \\E_{w_{\\hi+1}} [(w_{\\hi+1})_i (w_{\\hi+1})_j] \\\\\n & = \\sigma^2 \\mathrm{Tr}(P_{\\hi + 1})\n\\end{aligned}","position":{"start":{"line":403,"column":1},"end":{"line":408,"column":1}},"html":"Ewh+1[wh+1Ph+1wh+1]=i=1dj=1d(Ph+1)ijEwh+1[(wh+1)i(wh+1)j]=σ2Tr(Ph+1)\\begin{aligned}\n \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] & = \\sum_{i=1}^d \\sum_{j=1}^d (P_{\\hi+1})_{ij} \\E_{w_{\\hi+1}} [(w_{\\hi+1})_i (w_{\\hi+1})_j] \\\\\n & = \\sigma^2 \\mathrm{Tr}(P_{\\hi + 1})\n\\end{aligned}Ewh+1[wh+1Ph+1wh+1]=i=1dj=1d(Ph+1)ijEwh+1[(wh+1)i(wh+1)j]=σ2Tr(Ph+1)","enumerator":"2.15","key":"rYPE6smifk"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Quadratic forms","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"i3kpJWFfBC"}],"key":"Bx92B7CggG"},{"type":"paragraph","position":{"start":{"line":411,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"text","value":"When solving ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"CntcESgbV1"},{"type":"emphasis","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"children":[{"type":"text","value":"quadratic forms","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"SpOyqIr88Q"}],"key":"lGzDAmEHKD"},{"type":"text","value":", i.e. 
expressions of the form ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"vqAqHI4oAm"},{"type":"inlineMath","value":"x^\\top A x","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"xAxx^\\top A xxAx","key":"mRsE8gkdvt"},{"type":"text","value":",\nit’s often helpful to consider the terms on the diagonal (","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"Ev6B3V64G6"},{"type":"inlineMath","value":"i = j","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"i=ji = ji=j","key":"AfEn3SQbKq"},{"type":"text","value":") separately from those off the diagonal.","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"UvV1glTiNX"}],"key":"vvDnFatqqM"},{"type":"paragraph","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"In this case, the expectation of each diagonal term becomes","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"key":"f3oVo3CWRe"}],"key":"BFW3LI3Ici"},{"type":"math","value":"(P_{\\hi+1})_{ii} \\E (w_{\\hi+1})_i^2 = \\sigma^2 (P_{\\hi+1})_{ii}.","position":{"start":{"line":417,"column":1},"end":{"line":419,"column":1}},"html":"(Ph+1)iiE(wh+1)i2=σ2(Ph+1)ii.(P_{\\hi+1})_{ii} \\E (w_{\\hi+1})_i^2 = \\sigma^2 (P_{\\hi+1})_{ii}.(Ph+1)iiE(wh+1)i2=σ2(Ph+1)ii.","enumerator":"2.16","key":"y1axRRTiIs"},{"type":"paragraph","position":{"start":{"line":421,"column":1},"end":{"line":423,"column":1}},"children":[{"type":"text","value":"Off the diagonal, since the elements of ","position":{"start":{"line":421,"column":1},"end":{"line":421,"column":1}},"key":"ZzajR9SuO9"},{"type":"inlineMath","value":"w_{\\hi+1}","position":{"start":{"line":421,"column":1},"end":{"line":421,"column":1}},"html":"wh+1w_{\\hi+1}wh+1","key":"CgNJQnXuEZ"},{"type":"text","value":" are independent, the\nexpectation factors, and since each element has mean zero, the term\nvanishes:","position":{"start":{"line":421,"column":1},"end":{"line":421,"column":1}},"key":"JtaA5HuXbu"}],"key":"YJyyxmIFT3"},{"type":"math","value":"(P_{\\hi+1})_{ij} \\E [(w_{\\hi+1})_i] \\E [(w_{\\hi+1})_j] = 0.","position":{"start":{"line":425,"column":1},"end":{"line":427,"column":1}},"html":"(Ph+1)ijE[(wh+1)i]E[(wh+1)j]=0.(P_{\\hi+1})_{ij} \\E [(w_{\\hi+1})_i] \\E [(w_{\\hi+1})_j] = 0.(Ph+1)ijE[(wh+1)i]E[(wh+1)j]=0.","enumerator":"2.17","key":"TpT3COYtyJ"},{"type":"paragraph","position":{"start":{"line":429,"column":1},"end":{"line":431,"column":1}},"children":[{"type":"text","value":"Thus,\nthe only terms left are the ones on the diagonal,\nso the sum of these can be expressed as the trace of ","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"key":"sCVQr7BX9r"},{"type":"inlineMath","value":"\\sigma^2 P_{\\hi+1}","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"html":"σ2Ph+1\\sigma^2 P_{\\hi+1}σ2Ph+1","key":"X2HVBSjN4L"},{"type":"text","value":":","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"key":"J4bqtufNQl"}],"key":"dEM4J4fOdE"},{"type":"math","value":"\\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] = \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}).","position":{"start":{"line":433,"column":1},"end":{"line":435,"column":1}},"html":"Ewh+1[wh+1Ph+1wh+1]=σ2Tr(Ph+1).\\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] = \\sigma^2 
\\mathrm{Tr}(P_{\\hi+1}).Ewh+1[wh+1Ph+1wh+1]=σ2Tr(Ph+1).","enumerator":"2.18","key":"VasDgtmnEi"}],"key":"jGWsrxj7RP"},{"type":"paragraph","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"children":[{"type":"text","value":"Substituting this back into the expression for ","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"JS2WnwP6fB"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"iXdjOvj358"},{"type":"text","value":", we have:","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"UpCb8v425c"}],"key":"xIQACEFXNB"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act\n + 2\\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}) + p_{\\hi+1}.\n\\end{aligned}","position":{"start":{"line":440,"column":1},"end":{"line":446,"column":1}},"html":"Qh(x,u)=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+σ2Tr(Ph+1)+ph+1.\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act\n + 2\\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}) + p_{\\hi+1}.\n\\end{aligned}Qh(x,u)=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+σ2Tr(Ph+1)+ph+1.","enumerator":"2.19","key":"dhPactxjmK"},{"type":"paragraph","position":{"start":{"line":448,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"text","value":"As we hoped, this expression is quadratic in ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"xMqULYpibq"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"x\\stx","key":"FCZVY9JpiM"},{"type":"text","value":" and ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"cWRccu2bq7"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"Ti8K7YsAco"},{"type":"text","value":".\nFurthermore,\nwe’d like to show that it also ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"wDQIRs3u9K"},{"type":"emphasis","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"children":[{"type":"text","value":"curves upwards","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"dQvEWNrXuv"}],"key":"cC2wnVcBjv"},{"type":"text","value":"\nwith respect to ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"kxaiq45uXq"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"PZjZbD37um"},{"type":"text","value":"\nso that its minimum with respect to ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"PIujbxvAxo"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"o413UsD25L"},{"type":"text","value":" is well-defined.\nWe can do this by noting that the ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"GaOSHQcaTQ"},{"type":"strong","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"children":[{"type":"text","value":"Hessian 
matrix","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"UnkTXFr6oW"}],"key":"KcEcQOm0ja"},{"type":"text","value":" of second derivatives is positive definite:","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"dILbI2lR0d"}],"key":"BPxqe4v3y4"},{"type":"math","value":"\\nabla_{\\act \\act} Q_\\hi^\\star(\\st, \\act) = R + B^\\top P_{\\hi+1} B","position":{"start":{"line":455,"column":1},"end":{"line":457,"column":1}},"html":"uuQh(x,u)=R+BPh+1B\\nabla_{\\act \\act} Q_\\hi^\\star(\\st, \\act) = R + B^\\top P_{\\hi+1} BuuQh(x,u)=R+BPh+1B","enumerator":"2.20","key":"y79HS0bmIT"},{"type":"paragraph","position":{"start":{"line":459,"column":1},"end":{"line":464,"column":1}},"children":[{"type":"text","value":"Since ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"L70Y5W53iR"},{"type":"inlineMath","value":"R","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"html":"RRR","key":"EkSBHy85mQ"},{"type":"text","value":" is s.p.d. (by ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"Xls7NuPxw8"},{"type":"crossReference","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"children":[{"type":"text","value":"the LQR definition","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"m7TFPbYbIk"}],"identifier":"lqr_definition","label":"lqr_definition","kind":"proof:definition","template":"Definition %s","enumerator":"2.2","resolved":true,"html_id":"lqr-definition","key":"yACH2kN3Jk"},{"type":"text","value":"),\nand ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"dLuwEM4dj1"},{"type":"inlineMath","value":"P_{\\hi+1}","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"html":"Ph+1P_{\\hi+1}Ph+1","key":"IzvC47VpJQ"},{"type":"text","value":" is s.p.d. 
(by the inductive hypothesis),\nthis sum must also be s.p.d.,\nand so ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"lgA0MfTyHh"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"iAPBPHMQ3Q"},{"type":"text","value":" is indeed an upward-curved quadratic with respect to ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"vEhawTog7o"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"html":"u\\actu","key":"UQPuTtEtHz"},{"type":"text","value":".\n(If this isn’t clear, try proving it as an exercise.)\nThe proof of its upward curvature with respect to ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"EFEoJDOcE0"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"html":"x\\stx","key":"nBpFgtJ3wo"},{"type":"text","value":" is equivalent.","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"W7lhmEhbmA"}],"key":"VjnkDujIzJ"}],"enumerator":"2.1","key":"yTodstXcZp"},{"type":"proof","kind":"lemma","label":"lemma_pi_linear","identifier":"lemma_pi_linear","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"inlineMath","value":"\\pi^\\star_\\hi","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"html":"πh\\pi^\\star_\\hiπh","key":"bfLA1mozGR"},{"type":"text","value":" is linear","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"key":"ACcxJuQKcE"}],"key":"KyBulOFr2m"},{"type":"paragraph","position":{"start":{"line":470,"column":1},"end":{"line":473,"column":1}},"children":[{"type":"text","value":"Since ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"gbKhHFjTag"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"h6FL3vOERd"},{"type":"text","value":" is an upward-curved quadratic,\nfinding its minimum over ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"qhrrXGlRVE"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"u\\actu","key":"wEmlRLnYOn"},{"type":"text","value":" is easy:\nwe simply set the gradient with respect to ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"kRLrw6pVJI"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"u\\actu","key":"cdNu64qTEH"},{"type":"text","value":" equal to zero and solve for ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"bu2patMAze"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"u\\actu","key":"NWemoDUZHv"},{"type":"text","value":".\nFirst, we calculate the gradient:","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"Qf6WEEgSQ2"}],"key":"aZuObBIyo9"},{"type":"math","value":"\\begin{aligned}\n \\nabla_\\act Q^\\star_\\hi(\\st, \\act) & = \\nabla_\\act [ \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act ] \\\\\n & = 2 (R + B^\\top P_{\\hi+1} B) \\act + 2 (\\st^\\top A^\\top P_{\\hi+1} 
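The trace identity above is easy to check numerically. Below is a minimal sketch (assuming NumPy; the matrix $P$, dimension, and noise scale are arbitrary illustrative choices, not part of the derivation) that compares a Monte Carlo estimate of $\E[w^\top P w]$ against $\sigma^2 \mathrm{Tr}(P)$:

```python
# Sanity check: E[w^T P w] = sigma^2 Tr(P) for w ~ N(0, sigma^2 I).
import numpy as np

rng = np.random.default_rng(0)
d, sigma = 4, 0.5

# A random s.p.d. matrix (M M^T + I is always s.p.d.)
M = rng.normal(size=(d, d))
P = M @ M.T + np.eye(d)

# Monte Carlo estimate of the quadratic form over 100k noise draws
w = rng.normal(scale=sigma, size=(100_000, d))
mc_estimate = np.einsum("ni,ij,nj->n", w, P, w).mean()

print(mc_estimate)             # approximately sigma^2 * Tr(P)
print(sigma**2 * np.trace(P))  # exact value from the identity
```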
B)^\\top\n\\end{aligned}","position":{"start":{"line":475,"column":1},"end":{"line":480,"column":1}},"html":"uQh(x,u)=u[u(R+BPh+1B)u+2xAPh+1Bu]=2(R+BPh+1B)u+2(xAPh+1B)\\begin{aligned}\n \\nabla_\\act Q^\\star_\\hi(\\st, \\act) & = \\nabla_\\act [ \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act ] \\\\\n & = 2 (R + B^\\top P_{\\hi+1} B) \\act + 2 (\\st^\\top A^\\top P_{\\hi+1} B)^\\top\n\\end{aligned}uQh(x,u)=u[u(R+BPh+1B)u+2xAPh+1Bu]=2(R+BPh+1B)u+2(xAPh+1B)","enumerator":"2.21","key":"yTWTF1uCFt"},{"type":"paragraph","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"text","value":"Setting this to zero, we get","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"devWpRU35K"}],"key":"pdYghu3X16"},{"type":"math","value":"\\begin{aligned}\n 0 & = (R + B^\\top P_{\\hi+1} B) \\pi^\\star_\\hi(\\st) + B^\\top P_{\\hi+1} A \\st \\nonumber \\\\\n \\pi^\\star_\\hi(\\st) & = (R + B^\\top P_{\\hi+1} B)^{-1} (-B^\\top P_{\\hi+1} A \\st) \\nonumber \\\\\n & = - K_\\hi \\st,\n\\end{aligned}","position":{"start":{"line":484,"column":1},"end":{"line":490,"column":1}},"html":"0=(R+BPh+1B)πh(x)+BPh+1Axπh(x)=(R+BPh+1B)1(BPh+1Ax)=Khx,\\begin{aligned}\n 0 & = (R + B^\\top P_{\\hi+1} B) \\pi^\\star_\\hi(\\st) + B^\\top P_{\\hi+1} A \\st \\nonumber \\\\\n \\pi^\\star_\\hi(\\st) & = (R + B^\\top P_{\\hi+1} B)^{-1} (-B^\\top P_{\\hi+1} A \\st) \\nonumber \\\\\n & = - K_\\hi \\st,\n\\end{aligned}0πh(x)=(R+BPh+1B)πh(x)+BPh+1Ax=(R+BPh+1B)1(BPh+1Ax)=Khx,","enumerator":"2.22","key":"XkjPBTeJPc"},{"type":"paragraph","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"key":"dH8p8vrXBI"}],"key":"r6DtiHW30d"},{"type":"math","value":"K_\\hi = (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.","position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"identifier":"k_pi","label":"k_pi","html_id":"k-pi","html":"Kh=(R+BPh+1B)1BPh+1A.K_\\hi = (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.Kh=(R+BPh+1B)1BPh+1A.","enumerator":"2.23","key":"gM4uVuTJfj"},{"type":"paragraph","position":{"start":{"line":496,"column":1},"end":{"line":498,"column":1}},"children":[{"type":"text","value":"Note that this optimal policy doesn’t depend on the starting distribution ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"UXWMzPRIqM"},{"type":"inlineMath","value":"\\mu_0","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"μ0\\mu_0μ0","key":"FCyosztUjm"},{"type":"text","value":".\nIt’s also fully ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"U7H6RdJBvF"},{"type":"strong","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"rNz8m3FLEn"}],"key":"gXLmbPILHI"},{"type":"text","value":" and isn’t affected by the noise terms\n","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"F8MtlWM6Gr"},{"type":"inlineMath","value":"w_0, \\dots, w_{\\hor-1}","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"w0,,wH1w_0, \\dots, 
w_{\\hor-1}w0,,wH1","key":"IWpTFYuZZB"},{"type":"text","value":".","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"lDxaxDZ9Iz"}],"key":"ShDiBIXQhH"}],"enumerator":"2.2","html_id":"lemma-pi-linear","key":"jADkfXTlai"},{"type":"proof","kind":"lemma","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":501,"column":1},"end":{"line":501,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"F9CKqKMmJX"},{"type":"text","value":" is an upward-curved quadratic","position":{"start":{"line":501,"column":1},"end":{"line":501,"column":1}},"key":"GTixDZciJ0"}],"key":"yR3hiCERn4"},{"type":"paragraph","position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"children":[{"type":"text","value":"Using the identity ","position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"key":"DE7UgoubBx"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st) = Q^\\star_\\hi(\\st, \\pi^\\star(\\st))","position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"html":"Vh(x)=Qh(x,π(x))V^\\star_\\hi(\\st) = Q^\\star_\\hi(\\st, \\pi^\\star(\\st))Vh(x)=Qh(x,π(x))","key":"VOdjqKQHFi"},{"type":"text","value":", we have:","position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"key":"IzM6PaU43N"}],"key":"TyCp9iqJjy"},{"type":"math","value":"\\begin{aligned}\n V^\\star_\\hi(\\st) & = Q^\\star_\\hi(\\st, \\pi^\\star(\\st)) \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + (-K_\\hi \\st)^\\top (R + B^\\top P_{\\hi+1} B) (-K_\\hi \\st)\n + 2\\st^\\top A^\\top P_{\\hi+1} B (-K_\\hi \\st) \\\\\n & \\qquad + \\mathrm{Tr}(\\sigma^2 P_{\\hi+1}) + p_{\\hi+1}\n\\end{aligned}","position":{"start":{"line":505,"column":1},"end":{"line":512,"column":1}},"html":"Vh(x)=Qh(x,π(x))=x(Q+APh+1A)x+(Khx)(R+BPh+1B)(Khx)+2xAPh+1B(Khx)+Tr(σ2Ph+1)+ph+1\\begin{aligned}\n V^\\star_\\hi(\\st) & = Q^\\star_\\hi(\\st, \\pi^\\star(\\st)) \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + (-K_\\hi \\st)^\\top (R + B^\\top P_{\\hi+1} B) (-K_\\hi \\st)\n + 2\\st^\\top A^\\top P_{\\hi+1} B (-K_\\hi \\st) \\\\\n & \\qquad + \\mathrm{Tr}(\\sigma^2 P_{\\hi+1}) + p_{\\hi+1}\n\\end{aligned}Vh(x)=Qh(x,π(x))=x(Q+APh+1A)x+(Khx)(R+BPh+1B)(Khx)+2xAPh+1B(Khx)+Tr(σ2Ph+1)+ph+1","enumerator":"2.24","key":"QIpnhIypMP"},{"type":"paragraph","position":{"start":{"line":514,"column":1},"end":{"line":517,"column":1}},"children":[{"type":"text","value":"Note that with respect to ","position":{"start":{"line":514,"column":1},"end":{"line":514,"column":1}},"key":"TLJxw5htNQ"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":514,"column":1},"end":{"line":514,"column":1}},"html":"x\\stx","key":"ZLuzXXcCuz"},{"type":"text","value":",\nthis is the sum of a quadratic term and a constant,\nwhich is exactly what we were aiming for!\nThe scalar term is clearly","position":{"start":{"line":514,"column":1},"end":{"line":514,"column":1}},"key":"Rk440vaIAt"}],"key":"t0f5rXEKWg"},{"type":"math","value":"p_\\hi = \\mathrm{Tr}(\\sigma^2 P_{\\hi+1}) + p_{\\hi+1}.","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"html":"ph=Tr(σ2Ph+1)+ph+1.p_\\hi = \\mathrm{Tr}(\\sigma^2 P_{\\hi+1}) + p_{\\hi+1}.ph=Tr(σ2Ph+1)+ph+1.","enumerator":"2.25","key":"MKl7oUdlVR"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"We can simplify the quadratic term by substituting in 
","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"hyE6aYEWEh"},{"type":"inlineMath","value":"K_\\hi","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"html":"KhK_\\hiKh","key":"b9h2CChIi3"},{"type":"text","value":" from ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"kS1oIBtLoW"},{"type":"crossReference","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"(","key":"mcIoA9AEnR"},{"type":"text","value":"2.23","key":"rmhrckjpau"},{"type":"text","value":")","key":"lN8fP0hhYN"}],"identifier":"k_pi","label":"k_pi","kind":"equation","template":"(%s)","enumerator":"2.23","resolved":true,"html_id":"k-pi","key":"NjrLIpDgU9"},{"type":"text","value":".\nNotice that when we do this,\nthe ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"paPfU8SkvF"},{"type":"inlineMath","value":"(R+B^\\top P_{\\hi+1} B)","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"html":"(R+BPh+1B)(R+B^\\top P_{\\hi+1} B)(R+BPh+1B)","key":"z5gnNXxc1F"},{"type":"text","value":" term in the expression is cancelled out by its inverse,\nand the remaining terms combine to give the ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"Vh5nOlvPDo"},{"type":"strong","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"Riccati equation","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"VoWEMG8DYh"}],"key":"q9xH4hrfJ4"},{"type":"text","value":":","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"XpynJItVmf"}],"key":"uvF1Q4PqYo"},{"type":"proof","kind":"definition","label":"riccati","identifier":"riccati","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Riccati equation","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"EI1IRSszb9"}],"key":"DCIyAzfgvH"},{"type":"math","value":"P_\\hi = Q + A^\\top P_{\\hi+1} A - A^\\top P_{\\hi+1} B (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.","position":{"start":{"line":529,"column":1},"end":{"line":531,"column":1}},"html":"Ph=Q+APh+1AAPh+1B(R+BPh+1B)1BPh+1A.P_\\hi = Q + A^\\top P_{\\hi+1} A - A^\\top P_{\\hi+1} B (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.Ph=Q+APh+1AAPh+1B(R+BPh+1B)1BPh+1A.","enumerator":"2.26","key":"hLhcxtoYjo"}],"enumerator":"2.5","html_id":"riccati","key":"oKbcWKnv9A"},{"type":"paragraph","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"There are several nice properties to note about the Riccati equation:","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"pr0SUH0YkN"}],"key":"yNPwEhHpvM"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":536,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":536,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"It’s defined 
","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"zzAmDNP0in"},{"type":"strong","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"children":[{"type":"text","value":"recursively.","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"FyPVPdEIvp"}],"key":"MPXcWVJMWJ"},{"type":"text","value":"\nGiven the dynamics defined by ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"M2AC7NiLvi"},{"type":"inlineMath","value":"A","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"AAA","key":"UKAZAfcXT0"},{"type":"text","value":" and ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"vRKmyT0Onc"},{"type":"inlineMath","value":"B","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"BBB","key":"yTCuDkrtrh"},{"type":"text","value":", and the state cost matrix ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"KGd0O8F8vl"},{"type":"inlineMath","value":"Q","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"QQQ","key":"WQrzfW9DfR"},{"type":"text","value":",\nwe can recursively calculate ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"kOZE8yBNZc"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"PhP_\\hiPh","key":"XwL9iQCVcV"},{"type":"text","value":" across all timesteps starting from ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"ZmNpxC0raK"},{"type":"inlineMath","value":"P_\\hor = Q","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"PH=QP_\\hor = QPH=Q","key":"vgNgBi0yqm"},{"type":"text","value":".","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"VaoA9GKSCF"}],"key":"vUVRiN4vm2"},{"type":"listItem","spread":true,"position":{"start":{"line":539,"column":1},"end":{"line":540,"column":1}},"children":[{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"html":"PhP_\\hiPh","key":"UBQUUmJYvw"},{"type":"text","value":" often appears in calculations surrounding optimality,\nsuch as ","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"key":"Jx06QBSigN"},{"type":"inlineMath","value":"V^\\star_\\hi, Q^\\star_\\hi","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"html":"Vh,QhV^\\star_\\hi, Q^\\star_\\hiVh,Qh","key":"nyhpjFlAHj"},{"type":"text","value":", and ","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"key":"BhacFL20Hp"},{"type":"inlineMath","value":"\\pi^\\star_\\hi","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"html":"πh\\pi^\\star_\\hiπh","key":"hEXv3fRzou"},{"type":"text","value":".","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"key":"uxkpiV4scF"}],"key":"hbbxMpxzc6"},{"type":"listItem","spread":true,"position":{"start":{"line":541,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"text","value":"Together with the dynamics given by ","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"k0rDEEM5HT"},{"type":"inlineMath","value":"A","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"AAA","key":"ACom5h9qbt"},{"type":"text","value":" and 
","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"ZNzAYpwQBO"},{"type":"inlineMath","value":"B","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"BBB","key":"dtKwXarz49"},{"type":"text","value":",\nand the action coefficients ","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"vFC4XrEvrO"},{"type":"inlineMath","value":"R","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"RRR","key":"zJyiPHxZQ6"},{"type":"text","value":" in the lost function,\nit fully defines the optimal policy ","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"Y4btzy1woU"},{"type":"crossReference","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"children":[{"type":"text","value":"Lemma ","key":"cu4S6Rlj7n"},{"type":"text","value":"2.2","key":"iykvc0CHKU"}],"identifier":"lemma_pi_linear","label":"lemma_pi_linear","kind":"proof:lemma","template":"Lemma %s","enumerator":"2.2","resolved":true,"html_id":"lemma-pi-linear","key":"bMyc8zfeEY"},{"type":"text","value":".","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"an1PQRXA3G"}],"key":"RN83giiAom"}],"key":"APmFaQopKD"},{"type":"paragraph","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"children":[{"type":"text","value":"It remains to prove that ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"Hqw9yek8Rl"},{"type":"inlineMath","value":"V^\\star_\\hi","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"VhV^\\star_\\hiVh","key":"OnM6getMsr"},{"type":"text","value":" ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"xOPc2xf0bz"},{"type":"emphasis","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"children":[{"type":"text","value":"curves upwards,","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"OwGn85VM3V"}],"key":"kkAow2u3il"},{"type":"text","value":" that is, that ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"byI0ByVWip"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"PhP_\\hiPh","key":"Zew0doQuAp"},{"type":"text","value":" is s.p.d. 
It remains to prove that $V^\star_\hi$ *curves upwards,* that is, that $P_\hi$ is s.p.d.
We will use the following fact about **Schur complements:**

:::{prf:lemma} Positive definiteness of Schur complements
:label: lemma_schur

Let

$$
D = \begin{pmatrix}
A & B \\
B^\top & C
\end{pmatrix}
$$

be a symmetric $(m+n) \times (m+n)$ block matrix,
where $A \in \R^{m \times m}, B \in \R^{m \times n}, C \in \R^{n \times n}$.
The **Schur complement** of $A$ is denoted

$$D/A = C - B^\top A^{-1} B.$$

Schur complements have various uses in linear algebra and numerical computation.

A useful fact for us is that
if $A$ is positive *definite,*
then $D$ is positive *semidefinite*
if and only if $D/A$ is positive *semidefinite*.
:::
that","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"D24jGMiNdJ"}],"key":"CCF41AGMW8"},{"type":"math","value":"S = P - P B (R + B^\\top P B)^{-1} B^\\top P","position":{"start":{"line":579,"column":1},"end":{"line":581,"column":1}},"html":"S=PPB(R+BPB)1BPS = P - P B (R + B^\\top P B)^{-1} B^\\top PS=PPB(R+BPB)1BP","enumerator":"2.29","key":"hgsqhkEKQ0"},{"type":"paragraph","position":{"start":{"line":583,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"is p.s.d. (positive semidefinite),\nsince left- and right- multiplying by ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"N7VMnMor14"},{"type":"inlineMath","value":"A^\\top","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"AA^\\topA","key":"Hu0xC8x5vO"},{"type":"text","value":" and ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"V9DYuVwK9c"},{"type":"inlineMath","value":"A","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"AAA","key":"KtW1mk1TpQ"},{"type":"text","value":" respectively\npreserves p.s.d.\nWe note that ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"FzSEWafppv"},{"type":"inlineMath","value":"S","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"SSS","key":"ES4uuflqCv"},{"type":"text","value":" is the Schur complement ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"AdBMkKSenk"},{"type":"inlineMath","value":"D/(R + B^\\top P B)","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"D/(R+BPB)D/(R + B^\\top P B)D/(R+BPB)","key":"NwBnluUxft"},{"type":"text","value":", where","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"ijdfT3xMt9"}],"key":"lxB8ZINHQ5"},{"type":"math","value":"D = \\begin{pmatrix}\nR + B^\\top P B & B^\\top P \\\\\nP B & P\n\\end{pmatrix}.","position":{"start":{"line":588,"column":1},"end":{"line":593,"column":1}},"html":"D=(R+BPBBPPBP).D = \\begin{pmatrix}\nR + B^\\top P B & B^\\top P \\\\\nP B & P\n\\end{pmatrix}.D=(R+BPBPBBPP).","enumerator":"2.30","key":"fR2mohcx9r"},{"type":"paragraph","position":{"start":{"line":595,"column":1},"end":{"line":596,"column":1}},"children":[{"type":"text","value":"Thus we must show that ","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"ttd8djhn5f"},{"type":"inlineMath","value":"D","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"html":"DDD","key":"bb5ZvooIMi"},{"type":"text","value":" is p.s.d..\nThis can be seen by computing","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"l4ENVwZYef"}],"key":"VdwhGLjc4r"},{"type":"math","value":"\\begin{aligned}\n\\begin{pmatrix}\ny^\\top & z^\\top\n\\end{pmatrix}\nD\n\\begin{pmatrix}\ny \\\\ z\n\\end{pmatrix}\n&= y^\\top R y + y^\\top B^\\top P B y + 2 y^\\top B^\\top P z + z^\\top P z \\\\\n&= y^\\top R y + (By + z)^\\top P (By + z) \\\\\n&> 0.\n\\end{aligned}","position":{"start":{"line":598,"column":1},"end":{"line":611,"column":1}},"html":"(yz)D(yz)=yRy+yBPBy+2yBPz+zPz=yRy+(By+z)P(By+z)>0.\\begin{aligned}\n\\begin{pmatrix}\ny^\\top & z^\\top\n\\end{pmatrix}\nD\n\\begin{pmatrix}\ny \\\\ z\n\\end{pmatrix}\n&= y^\\top R y + y^\\top B^\\top P B y + 2 y^\\top B^\\top P z + z^\\top P z \\\\\n&= y^\\top R y + (By + z)^\\top P (By + z) \\\\\n&> 
0.\n\\end{aligned}(yz)D(yz)=yRy+yBPBy+2yBPz+zPz=yRy+(By+z)P(By+z)>0.","enumerator":"2.31","key":"VN91QvIg7S"},{"type":"paragraph","position":{"start":{"line":613,"column":1},"end":{"line":615,"column":1}},"children":[{"type":"text","value":"Since ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"BNRU8boPoU"},{"type":"inlineMath","value":"R + B^\\top P B","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"R+BPBR + B^\\top P BR+BPB","key":"qraYTRbCL0"},{"type":"text","value":" is p.d. and ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"l0Diu8DTaG"},{"type":"inlineMath","value":"D","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"DDD","key":"pbVq5Isj6S"},{"type":"text","value":" is p.s.d.,\nthen ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"zTHxeAPLzU"},{"type":"inlineMath","value":"S = D / (R + B^\\top P B)","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"S=D/(R+BPB)S = D / (R + B^\\top P B)S=D/(R+BPB)","key":"UI8qNOTSTS"},{"type":"text","value":" must be p.s.d.,\nand ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"DnLkAiAzYl"},{"type":"inlineMath","value":"P_\\hi = Q + A S A^\\top","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"Ph=Q+ASAP_\\hi = Q + A S A^\\topPh=Q+ASA","key":"bkgLOHwiB4"},{"type":"text","value":" must be p.d.","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"kGFg3S3pOd"}],"key":"PrW0wnXTIf"}],"enumerator":"2.3","key":"vcYO0yCZlt"},{"type":"paragraph","position":{"start":{"line":618,"column":1},"end":{"line":620,"column":1}},"children":[{"type":"text","value":"Now we’ve shown that ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"PXDjHCDLx3"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hi","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"html":"Vh(x)=xPhx+phV^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hiVh(x)=xPhx+ph","key":"kPWo6j66Ry"},{"type":"text","value":",\nwhere ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"uyZPdlShro"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"html":"PhP_\\hiPh","key":"IZmllTROIf"},{"type":"text","value":" is s.p.d.,\nproving the inductive hypothesis and completing the proof of ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"bqJ3zyKyH2"},{"type":"crossReference","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"children":[{"type":"text","value":"Theorem ","key":"QLFTqwPwDE"},{"type":"text","value":"2.2","key":"oCocGcbFJ9"}],"identifier":"optimal_policy_lqr_linear","label":"optimal_policy_lqr_linear","kind":"proof:theorem","template":"Theorem %s","enumerator":"2.2","resolved":true,"html_id":"optimal-policy-lqr-linear","key":"ACMapB7p9c"},{"type":"text","value":" and ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"oaEEXDCYWJ"},{"type":"crossReference","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"children":[{"type":"text","value":"Theorem 
","key":"ROYREPmC2T"},{"type":"text","value":"2.1","key":"cbhu4yYk5s"}],"identifier":"optimal_value_lqr_quadratic","label":"optimal_value_lqr_quadratic","kind":"proof:theorem","template":"Theorem %s","enumerator":"2.1","resolved":true,"html_id":"optimal-value-lqr-quadratic","key":"AjCoV07Wi0"},{"type":"text","value":".","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"zEn2L9Wmnq"}],"key":"SXGC4oZWcw"},{"type":"paragraph","position":{"start":{"line":622,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"In summary, we just demonstrated that at each timestep ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"Q81vbIYLi3"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"R5iQQF9oDm"},{"type":"text","value":",\nthe optimal value function ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"J8fSwIQRZw"},{"type":"inlineMath","value":"V^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"VhV^\\star_\\hiVh","key":"NrVHDlc3oK"},{"type":"text","value":"\nand optimal Q-function ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"DFq48eIHkl"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"y5SSvioWrG"},{"type":"text","value":" are both upward-curved quadratics\nand the optimal policy ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"e221cMhk53"},{"type":"inlineMath","value":"\\pi^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"πh\\pi^\\star_\\hiπh","key":"GGRJmM3p3N"},{"type":"text","value":" is linear.\nWe also showed that all of these quantities can be calculated\nusing a sequence of s.p.d. 
matrices ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"R0GO4vCWIQ"},{"type":"inlineMath","value":"P_0, \\dots, P_H","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"P0,,PHP_0, \\dots, P_HP0,,PH","key":"E4i4XzClsE"},{"type":"text","value":"\nthat can be defined recursively using the Riccati equation ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"MSVGolEp7K"},{"type":"crossReference","kind":"proof:definition","identifier":"riccati","label":"riccati","children":[{"type":"text","value":"Definition ","key":"sGicoN829C"},{"type":"text","value":"2.5","key":"PLZMtsOgKS"}],"template":"Definition %s","enumerator":"2.5","resolved":true,"html_id":"riccati","key":"O7bz0EmGyj"},{"type":"text","value":".","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"Bgj2DFgokv"}],"key":"bMaGXuNmSR"},{"type":"paragraph","position":{"start":{"line":630,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"text","value":"Before we move on to some extensions of LQR, let’s consider how the\nstate at time ","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"key":"VXz6vt12Ux"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"html":"h\\hih","key":"qVgmX2y6m3"},{"type":"text","value":" behaves when we act according to this optimal\npolicy.","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"key":"svEZPZ4zaJ"}],"key":"k7l5LsZ2Bn"},{"type":"heading","depth":3,"position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"children":[{"type":"text","value":"Expected state at time ","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"qalaLqVOS5"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"html":"h\\hih","key":"oPKEnzhiyc"}],"identifier":"expected-state-at-time-hi","label":"Expected state at time \\hi","html_id":"expected-state-at-time-hi","implicit":true,"enumerator":"2.4.1","key":"Xqr2EJGCK3"},{"type":"paragraph","position":{"start":{"line":636,"column":1},"end":{"line":639,"column":1}},"children":[{"type":"text","value":"How can we compute the expected state at time ","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"quuFjwqoqx"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"html":"h\\hih","key":"QC5AOlrWn2"},{"type":"text","value":" when acting\naccording to the optimal policy? Let’s first express ","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"FMYU1SJNy2"},{"type":"inlineMath","value":"\\st_\\hi","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"html":"xh\\st_\\hixh","key":"wnTR0AcsAc"},{"type":"text","value":" in a\ncleaner way in terms of the history. 
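To make this concrete, here is a minimal sketch of the backward Riccati recursion in code, assuming time-homogeneous dynamics and costs given as NumPy arrays and taking the terminal cost to be $\st_\hor^\top Q \st_\hor$; the function name and interface are illustrative rather than fixed by the text:

```python
import numpy as np

def lqr_solve(A, B, Q, R, H):
    """Backward Riccati recursion for time-homogeneous LQR.

    Returns the gain matrices K_0, ..., K_{H-1}, with which the optimal
    policy acts linearly: pi*_h(x) = -K_h @ x.
    """
    P = Q.copy()  # P_H = Q (terminal cost assumed quadratic in the state)
    gains = []
    for _ in range(H):
        # K_h = (R + B^T P_{h+1} B)^{-1} B^T P_{h+1} A
        K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)
        # Riccati update: P_h = Q + A^T P_{h+1} A - A^T P_{h+1} B K_h
        P = Q + A.T @ P @ A - A.T @ P @ B @ K
        gains.append(K)
    return gains[::-1]  # gains[h] is K_h
```

Acting with $\act_\hi = -K_\hi \st_\hi$ then recovers the linear form of the optimal policy above.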
Before we move on to some extensions of LQR, let's consider how the state at time $\hi$ behaves when we act according to this optimal policy.

### Expected state at time $\hi$

How can we compute the expected state at time $\hi$ when acting according to the optimal policy? Let's first express $\st_\hi$ in a cleaner way in terms of the history. Note that having linear dynamics makes it easy to expand terms backwards in time:

$$
\begin{aligned}
\st_\hi & = A \st_{\hi-1} + B \act_{\hi-1} + w_{\hi-1} \\
& = A (A\st_{\hi-2} + B \act_{\hi-2} + w_{\hi-2}) + B \act_{\hi-1} + w_{\hi-1} \\
& = \cdots \\
& = A^\hi \st_0 + \sum_{i=0}^{\hi-1} A^i (B \act_{\hi-i-1} + w_{\hi-i-1}).
\end{aligned}
$$

Let's consider the *average state* at this time, given all the past states and actions. Since we assume that $\E [w_\hi] = 0$ (this is the zero vector in $d$ dimensions), when we take an expectation, the $w_\hi$ term vanishes due to linearity, and so we're left with

$$
\E [\st_\hi \mid \st_{0:(\hi-1)}, \act_{0:(\hi-1)}] = A^\hi \st_0 + \sum_{i=0}^{\hi-1} A^i B \act_{\hi-i-1}.
$$ (expected_state)

:::{attention} Exercise
Show that if we choose actions according to the optimal policy ([Lemma 2.2](#lemma_pi_linear)), [(2.33)](#expected_state) becomes

$$
\E [\st_\hi \mid \st_0, \act_i = \pi^\star_i(\st_i) \quad \forall i \le \hi] = \left( \prod_{i=0}^{\hi-1} (A - B K_i) \right) \st_0.
$$
:::
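We can sanity-check the closed form above against a rollout of the noise-free dynamics; a minimal sketch, assuming `numpy`, with arbitrary illustrative dimensions and actions:

```python
import numpy as np

rng = np.random.default_rng(0)
n, m, h = 3, 2, 5  # illustrative state/action dimensions and timestep
A, B = rng.normal(size=(n, n)), rng.normal(size=(n, m))
x0 = rng.normal(size=n)
actions = [rng.normal(size=m) for _ in range(h)]  # any fixed action sequence

# Roll out x_{t+1} = A x_t + B u_t (the w_t = 0 case, i.e. the expectation)...
x = x0
for u in actions:
    x = A @ x + B @ u

# ...and compare with A^h x_0 + sum_i A^i B u_{h-i-1}.
closed_form = np.linalg.matrix_power(A, h) @ x0 + sum(
    np.linalg.matrix_power(A, i) @ B @ actions[h - i - 1] for i in range(h)
)
assert np.allclose(x, closed_form)
```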
This introduces the quantity $A - B K_i$, which shows up frequently in control theory. For example, one important question is: will $\st_\hi$ remain bounded, or will it go to infinity as time goes on? To answer this, let's imagine for simplicity that these $K_i$ are all equal (call this matrix $K$). Then the expression above becomes $(A-BK)^\hi \st_0$. Now consider the maximum eigenvalue $\lambda_{\max}$ of $A - BK$. If $|\lambda_{\max}| > 1$, then there's some nonzero initial state $\bar \st_0$, the corresponding eigenvector, for which

$$
\lim_{\hi \to \infty} (A - BK)^\hi \bar \st_0
= \lim_{\hi \to \infty} \lambda_{\max}^\hi \bar \st_0
= \infty.
$$

Otherwise, if $|\lambda_{\max}| < 1$, the expected state cannot blow up in this way: when every eigenvalue of $A - BK$ has magnitude less than 1, $(A - BK)^\hi \st_0$ in fact decays to zero for any initial state $\st_0$.
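This stability criterion is straightforward to check numerically; a minimal sketch, assuming `numpy`, with illustrative matrices:

```python
import numpy as np

def closed_loop_is_stable(A, B, K):
    """Check whether x_{h+1} = (A - B K) x_h decays to zero,
    i.e. whether the spectral radius of A - B K is below 1."""
    return np.abs(np.linalg.eigvals(A - B @ K)).max() < 1.0

# Illustrative example: an unstable open loop stabilized by a gain K.
A = np.array([[1.1, 0.0], [0.0, 0.5]])  # open-loop eigenvalue 1.1 > 1
B = np.array([[1.0], [0.0]])
K = np.array([[0.3, 0.0]])              # A - B K has eigenvalues 0.8, 0.5
print(closed_loop_is_stable(A, B, K))   # True
```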
## Extensions

We've now formulated an optimal solution for the time-homogeneous LQR and computed the expected state under the optimal policy. However, real world tasks rarely have such simple dynamics, and we may wish to design more complex cost functions. In this section, we'll consider more general extensions of LQR where some of the assumptions we made above are relaxed. Specifically, we'll consider:

1. **Time-dependency**, where the dynamics and cost function might change depending on the timestep.

2. **General quadratic cost**, where we allow for linear terms and a constant term.

3. **Tracking a goal trajectory** rather than aiming for a single goal state-action pair.

Combining these will allow us to use the LQR solution to solve more complex setups by taking *Taylor approximations* of the dynamics and cost functions.

(time_dep_lqr)=
### Time-dependent dynamics and cost function

So far, we've considered the *time-homogeneous* case, where the dynamics and cost function stay the same at every timestep. However, this might not always be the case. As an example, in many sports, the rules and scoring system might change during an overtime period.
To address these sorts of problems, we can loosen the time-homogeneous restriction, and consider the case where the dynamics and cost function are *time-dependent.* Our analysis remains almost identical; in fact, we can simply add a time index to the matrices $A$ and $B$ that determine the dynamics and the matrices $Q$ and $R$ that determine the cost.

The modified problem is now defined as follows:

:::{prf:definition} Time-dependent LQR
:label: time_dependent_lqr

$$
\begin{aligned}
\min_{\pi_{0}, \dots, \pi_{\hor-1}} \quad & \E \left[ \left( \sum_{\hi=0}^{\hor-1} (\st_\hi^\top Q_\hi \st_\hi) + \act_\hi^\top R_\hi \act_\hi \right) + \st_\hor^\top Q_\hor \st_\hor \right] \\
\textrm{where} \quad & \st_{\hi+1} = f_\hi(\st_\hi, \act_\hi, w_\hi) = A_\hi \st_\hi + B_\hi \act_\hi + w_\hi \\
& \st_0 \sim \mu_0 \\
& \act_\hi = \pi_\hi (\st_\hi) \\
& w_\hi \sim \mathcal{N}(0, \sigma^2 I).
\end{aligned}
$$
:::

The derivation of the optimal value functions and the optimal policy remains almost exactly the same, and we can modify the Riccati equation accordingly:

:::{prf:definition} Time-dependent Riccati Equation
:label: riccati_time_dependent

$$
P_\hi = Q_\hi + A_\hi^\top P_{\hi+1} A_\hi - A_\hi^\top P_{\hi+1} B_\hi (R_\hi + B_\hi^\top P_{\hi+1} B_\hi)^{-1} B_\hi^\top P_{\hi+1} A_\hi.
$$

Note that this is just the time-homogeneous Riccati equation ([Definition 2.5](#riccati)), but with the time index added to each of the relevant matrices.
:::

:::{attention} Exercise
Walk through the proof in [Section 2.4](#optimal_lqr) to verify that we can simply add $\hi$ for the time-dependent case.
:::
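The time-dependent recursion translates almost directly into code; a sketch, assuming the per-timestep matrices are passed as lists, with `Qs` of length $H + 1$ so that `Qs[H]` plays the role of the terminal cost matrix $Q_\hor$:

```python
import numpy as np

def riccati_time_dependent(As, Bs, Qs, Rs):
    """Compute P_0, ..., P_H via the time-dependent Riccati equation.

    As, Bs, Rs each have length H; Qs has length H + 1 (terminal cost last).
    """
    H = len(As)
    Ps = [None] * (H + 1)
    Ps[H] = Qs[H]
    for h in reversed(range(H)):
        A, B, P_next = As[h], Bs[h], Ps[h + 1]
        gain = np.linalg.solve(Rs[h] + B.T @ P_next @ B, B.T @ P_next @ A)
        Ps[h] = Qs[h] + A.T @ P_next @ A - A.T @ P_next @ B @ gain
    return Ps
```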
","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"O1yYrhDH52"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"html":"h\\hih","key":"VPeCG4Af8B"},{"type":"text","value":" for the time-dependent case.","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"KruXto9ba5"}],"key":"oNnBPIwhlI"}],"key":"imdLIt4t7u"},{"type":"paragraph","position":{"start":{"line":763,"column":1},"end":{"line":765,"column":1}},"children":[{"type":"text","value":"Additionally, by allowing the dynamics to vary across time, we gain the\nability to ","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"Sj7cTIzv0n"},{"type":"emphasis","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"children":[{"type":"text","value":"locally approximate","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"AvbPKTSP7X"}],"key":"ATFVF9va7P"},{"type":"text","value":" nonlinear dynamics at each timestep.\nWe’ll discuss this later in the chapter.","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"D9AmPOT82P"}],"key":"M8ZbxrM5tH"},{"type":"heading","depth":3,"position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"text","value":"More general quadratic cost functions","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"key":"fV5rGYuMKH"}],"identifier":"more-general-quadratic-cost-functions","label":"More general quadratic cost functions","html_id":"more-general-quadratic-cost-functions","implicit":true,"enumerator":"2.5.2","key":"KpVsRBcXfF"},{"type":"paragraph","position":{"start":{"line":769,"column":1},"end":{"line":776,"column":1}},"children":[{"type":"text","value":"Our original cost function had only second-order terms with respect to\nthe state and action, incentivizing staying as close as possible to\n","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"ImnFGFZ8nu"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star) = (0, 0)","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"(x,u)=(0,0)(\\st^\\star, \\act^\\star) = (0, 0)(x,u)=(0,0)","key":"iROkgoLn7f"},{"type":"text","value":". We can also consider more general\nquadratic cost functions that also have first-order terms and a constant\nterm. 
Combining this with time-dependent dynamics results in the\nfollowing expression, where we introduce a new matrix ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"Waz9EugVyY"},{"type":"inlineMath","value":"M_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"MhM_\\hiMh","key":"xjMuSsisAK"},{"type":"text","value":" for the\ncross term, linear coefficients ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"yZxFsjY19Z"},{"type":"inlineMath","value":"q_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"qhq_\\hiqh","key":"R0JR1twHsK"},{"type":"text","value":" and ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"qSD1jPSSgn"},{"type":"inlineMath","value":"r_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"rhr_\\hirh","key":"pUkKbJOcOO"},{"type":"text","value":" for the state and\naction respectively, and a constant term ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"Sjy2s6d8Rv"},{"type":"inlineMath","value":"c_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"chc_\\hich","key":"nY6mJPvqh8"},{"type":"text","value":":","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"obbJ6G0e3O"}],"key":"RSdlNkedo4"},{"type":"math","value":"c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.","label":"general_quadratic_cost","identifier":"general_quadratic_cost","html":"ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.","enumerator":"2.38","html_id":"general-quadratic-cost","key":"m2QZCfxQFf"},{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"Similarly, we can also include a\nconstant term ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"G2g3mjxeL4"},{"type":"inlineMath","value":"v_\\hi \\in \\mathbb{R}^{n_\\st}","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"vhRnxv_\\hi \\in \\mathbb{R}^{n_\\st}vhRnx","key":"XV5AIFxYim"},{"type":"text","value":" in the dynamics (note that this is\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"J4MYWkEt50"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"PDxfXExNsG"}],"key":"sgJGkDJExi"},{"type":"text","value":" at each timestep, unlike the stochastic noise ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"rTcPf4bT8x"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"whw_\\hiwh","key":"idWxsnhxrI"},{"type":"text","value":"):","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"VZlUX7sI4o"}],"key":"PIa0zIH5M7"},{"type":"math","value":"\\st_{\\hi+1} = f_\\hi(\\st_\\hi, 
\\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.","position":{"start":{"line":789,"column":1},"end":{"line":791,"column":1}},"html":"xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.","enumerator":"2.39","key":"p0Rkz8GdrH"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"exercise","position":{"start":{"line":795,"column":1},"end":{"line":795,"column":1}},"key":"Z2aS58ipOm"}],"key":"gaM0NoHpus"},{"type":"paragraph","position":{"start":{"line":796,"column":1},"end":{"line":797,"column":1}},"children":[{"type":"text","value":"Derive the optimal solution. You will need to slightly modify the\nproof in ","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"cJvy7ZaE4C"},{"type":"crossReference","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"text","value":"Section ","key":"WWmIisqBbq"},{"type":"text","value":"2.4","key":"nwVGFDpTQD"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"LZsnEXzUf9"},{"type":"text","value":".","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"gAIWzSbymR"}],"key":"GoN9xZAN6F"}],"key":"GVD7L0YDtl"},{"type":"heading","depth":3,"position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"text","value":"Tracking a predefined trajectory","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"SXcvjnPVYL"}],"identifier":"tracking-a-predefined-trajectory","label":"Tracking a predefined trajectory","html_id":"tracking-a-predefined-trajectory","implicit":true,"enumerator":"2.5.3","key":"MSsWQN4y4R"},{"type":"paragraph","position":{"start":{"line":802,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Consider applying LQR to a task like autonomous driving, where the\ntarget state-action pair changes over time. We might want the vehicle to\nfollow a predefined ","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"NXnoTMqGgw"},{"type":"emphasis","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"vPUVJNNcmY"}],"key":"jP6VHt61Dn"},{"type":"text","value":" of states and actions\n","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"T7m3meq1gO"},{"type":"inlineMath","value":"(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"html":"(xh,uh)h=0H1(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}(xh,uh)h=0H1","key":"wloSzcZ18q"},{"type":"text","value":". 
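For concreteness, evaluating this cost for given coefficients is a one-liner per term; a minimal sketch, assuming `numpy`, with all names illustrative:

```python
import numpy as np

def general_quadratic_cost(x, u, Q, M, R, q, r, c):
    """Evaluate c_h(x, u) from the general quadratic form above."""
    quadratic = x @ Q @ x + x @ M @ u + u @ R @ u
    linear = x @ q + u @ r
    return quadratic + linear + c
```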
### Tracking a predefined trajectory

Consider applying LQR to a task like autonomous driving, where the target state-action pair changes over time. We might want the vehicle to follow a predefined *trajectory* of states and actions $(\st_\hi^\star, \act_\hi^\star)_{\hi=0}^{\hor-1}$. To express this as a control problem, we'll need a corresponding time-dependent cost function:

$$
c_\hi(\st_\hi, \act_\hi) = (\st_\hi - \st^\star_\hi)^\top Q (\st_\hi - \st^\star_\hi) + (\act_\hi - \act^\star_\hi)^\top R (\act_\hi - \act^\star_\hi).
$$

Note that this punishes states and actions that are far from the intended trajectory. By expanding out these multiplications, we can see that this is actually a special case of the more general quadratic cost function above [(2.38)](#general_quadratic_cost):

$$
M_\hi = 0, \qquad q_\hi = -2Q \st^\star_\hi, \qquad r_\hi = -2R \act^\star_\hi, \qquad c_\hi = (\st^\star_\hi)^\top Q (\st^\star_\hi) + (\act^\star_\hi)^\top R (\act^\star_\hi).
$$
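We can verify this reduction numerically; a minimal sketch, assuming `numpy`, with arbitrary targets and test points:

```python
import numpy as np

rng = np.random.default_rng(1)
Q, R = np.diag([1.0, 2.0, 3.0]), np.diag([0.5, 1.5])  # illustrative p.d. costs
x_star, u_star = rng.normal(size=3), rng.normal(size=2)
x, u = rng.normal(size=3), rng.normal(size=2)

# Tracking cost...
tracking = (x - x_star) @ Q @ (x - x_star) + (u - u_star) @ R @ (u - u_star)

# ...equals the general quadratic cost with the coefficients above (M_h = 0).
q, r = -2 * Q @ x_star, -2 * R @ u_star
c = x_star @ Q @ x_star + u_star @ R @ u_star
general = x @ Q @ x + u @ R @ u + x @ q + u @ r + c
assert np.allclose(tracking, general)
```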
","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"n9CipPEm9d"},{"type":"emphasis","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"upward-curved quadratic","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"r1KsMpQRUX"}],"key":"pyvXODzHyC"},{"type":"text","value":". However,\nreal settings are rarely this simple! Let’s return to the CartPole\nexample from the start of the chapter\n(","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"nKqfI8ioRG"},{"type":"crossReference","kind":"proof:example","identifier":"cart_pole","label":"cart_pole","children":[{"type":"text","value":"Example ","key":"RnpmO2Hn3M"},{"type":"text","value":"2.1","key":"FihiPqFdoK"}],"template":"Example %s","enumerator":"2.1","resolved":true,"html_id":"cart-pole","key":"aisqFnKxhO"},{"type":"text","value":"). The dynamics (physics) aren’t linear. How\ncan we approximate this by an LQR problem?","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"oRrmfcoFzo"}],"key":"qZwXHYYEZ0"},{"type":"paragraph","position":{"start":{"line":837,"column":1},"end":{"line":840,"column":1}},"children":[{"type":"text","value":"Concretely, let’s consider a ","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"jwqhbc6aYi"},{"type":"emphasis","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"children":[{"type":"text","value":"noise-free","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"Tt3WAsRfCo"}],"key":"LMbl90FQFB"},{"type":"text","value":" problem since, as we saw, the\nnoise doesn’t factor into the optimal policy. Let’s assume the dynamics\nand cost function are stationary, and ignore the terminal state for\nsimplicity:","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"TmE6U5l7IE"}],"key":"QkdRWfrYys"},{"type":"proof","kind":"definition","label":"nonlinear_control","identifier":"nonlinear_control","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Nonlinear control problem","position":{"start":{"line":842,"column":1},"end":{"line":842,"column":1}},"key":"puOi7uMT3C"}],"key":"HCh4lIzatx"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, \\act^\\star).\n\\end{aligned}","position":{"start":{"line":847,"column":1},"end":{"line":855,"column":1}},"html":"minπ0,,πH1:SAEx0[h=0H1c(xh,uh)]wherexh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, 
\\act^\\star).\n\\end{aligned}π0,,πH1:SAminwhereEx0[h=0H1c(xh,uh)]xh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).","enumerator":"2.42","key":"OdqaWXTwEg"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":858,"column":1}},"children":[{"type":"text","value":"Here, ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"H6kSc1raUm"},{"type":"inlineMath","value":"d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"ddd","key":"cHw8Tvx1um"},{"type":"text","value":" denotes a function that measures the\n“distance” between its two arguments.","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"gAHUa03Xak"}],"key":"Ofy1ijApqE"}],"enumerator":"2.8","html_id":"nonlinear-control","key":"kce4bs83bR"},{"type":"paragraph","position":{"start":{"line":861,"column":1},"end":{"line":871,"column":1}},"children":[{"type":"text","value":"This is now only slightly simplified from the general optimal control\nproblem (see\n","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"PsyCGdE0xj"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"mmJOoMFVCz"},{"type":"text","value":"2.1","key":"PG7Flyn59P"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"stDvL6CWWg"},{"type":"text","value":"). Here, we don’t know an analytical form\nfor the dynamics ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"FQLgt8eqpc"},{"type":"inlineMath","value":"f","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"fff","key":"bA9Ikdbika"},{"type":"text","value":" or the cost function ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"BRSExWULc9"},{"type":"inlineMath","value":"c","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"ccc","key":"tRck6celdd"},{"type":"text","value":", but we assume that we’re\nable to ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"CpGM399faI"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"query/sample/simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"FQ55BzmLC5"}],"key":"ZAsKHj1Mzu"},{"type":"text","value":" them to get their values at a given\nstate and action. To clarify, consider the case where the dynamics are\ngiven by real world physics. We can’t (yet) write down an expression for\nthe dynamics that we can differentiate or integrate analytically.\nHowever, we can still ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"aP4M4JTbzM"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"zgJrKcFG2h"}],"key":"sc2ATK2TxM"},{"type":"text","value":" the dynamics and cost function by\nrunning a real-world experiment and measuring the resulting states and\ncosts. 
This is now only slightly simplified from the general optimal control problem (see [Definition 2.1](#optimal_control)). Here, we don't know an analytical form for the dynamics $f$ or the cost function $c$, but we assume that we're able to *query/sample/simulate* them to get their values at a given state and action. To clarify, consider the case where the dynamics are given by real world physics. We can't (yet) write down an expression for the dynamics that we can differentiate or integrate analytically. However, we can still *simulate* the dynamics and cost function by running a real-world experiment and measuring the resulting states and costs. How can we adapt LQR to this more general nonlinear case?

### Local linearization

How can we apply LQR when the dynamics are nonlinear or the cost function is more complex? We'll exploit the useful fact that we can take a function that's *locally continuous* around $(s^\star, a^\star)$ and approximate it nearby with low-order polynomials (i.e. its Taylor approximation). In particular, as long as the dynamics $f$ are differentiable around $(\st^\star, \act^\star)$ and the cost function $c$ is twice differentiable at $(\st^\star, \act^\star)$, we can take a linear approximation of $f$ and a quadratic approximation of $c$ to bring us back to the regime of LQR.
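Since we can only query $f$, the Jacobians appearing below can be estimated from samples; a minimal sketch using central finite differences, assuming `numpy`, where `f` is the black-box dynamics and the step size `eps` is an arbitrary choice:

```python
import numpy as np

def linearize_dynamics(f, x_star, u_star, eps=1e-5):
    """Estimate A = grad_x f and B = grad_u f at (x*, u*) by finite differences."""
    n, m = x_star.shape[0], u_star.shape[0]
    A, B = np.zeros((n, n)), np.zeros((n, m))
    for j in range(n):
        dx = np.zeros(n)
        dx[j] = eps
        A[:, j] = (f(x_star + dx, u_star) - f(x_star - dx, u_star)) / (2 * eps)
    for j in range(m):
        du = np.zeros(m)
        du[j] = eps
        B[:, j] = (f(x_star, u_star + du) - f(x_star, u_star - du)) / (2 * eps)
    return A, B
```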
to\nbring us back to the regime of LQR.","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"yusJ7SKusB"}],"key":"onEqFVxsEh"},{"type":"paragraph","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"children":[{"type":"text","value":"Linearizing the dynamics around ","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"keUasQ0rzp"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"Mgh2wVmvYw"},{"type":"text","value":" gives:","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"iA0q3qtSbX"}],"key":"briyFsWuts"},{"type":"math","value":"\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}","position":{"start":{"line":888,"column":1},"end":{"line":893,"column":1}},"html":"f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dfi(x,u)dxj,i,jnx(uf(x,u))ij=dfi(x,u)duj,inx,jnu\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dxjdfi(x,u),i,jnx(uf(x,u))ij=dujdfi(x,u),inx,jnu","enumerator":"2.43","key":"YVWoDUERfk"},{"type":"paragraph","position":{"start":{"line":895,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"and quadratizing the cost function around\n","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"v0oFErH8Jk"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"xoPWsbAUI4"},{"type":"text","value":" gives:","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"NhVsvTd3if"}],"key":"ElAorhwPQj"},{"type":"math","value":"\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. 
\\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":908,"column":1}},"html":"c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+12(xx)xxc(x,u)(xx)+12(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)}quadratic terms\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. \\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+21(xx)xxc(x,u)(xx)+21(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)quadratic terms","enumerator":"2.44","key":"qAzuT7C674"},{"type":"paragraph","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"where the gradients and Hessians are defined as","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"key":"UQrJy8QjY6"}],"key":"R2eagbwm9g"},{"type":"math","value":"\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. \\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}","position":{"start":{"line":913,"column":1},"end":{"line":921,"column":1}},"html":"(xc(x,u))i=dc(x,u)dxi,inx(uc(x,u))i=dc(x,u)dui,inu(xxc(x,u))ij=d2c(x,u)dxidxj,i,jnx(uuc(x,u))ij=d2c(x,u)duiduj,i,jnu(xuc(x,u))ij=d2c(x,u)dxiduj.inx,jnu\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. 
\\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}(xc(x,u))i(xxc(x,u))ij(xuc(x,u))ij=dxidc(x,u),inx=dxidxjd2c(x,u),i,jnx=dxidujd2c(x,u).inx,jnu(uc(x,u))i(uuc(x,u))ij=duidc(x,u),inu=duidujd2c(x,u),i,jnu","enumerator":"2.45","key":"yeotyWWusV"},{"type":"paragraph","position":{"start":{"line":925,"column":1},"end":{"line":928,"column":1}},"children":[{"type":"strong","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"rc0GIHW8Ut"}],"key":"VmzbEaIHv1"},{"type":"text","value":" Note that this cost can be expressed in the general\nquadratic form seen in\n","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"YHAeoIfobP"},{"type":"crossReference","kind":"equation","identifier":"general_quadratic_cost","label":"general_quadratic_cost","children":[{"type":"text","value":"(","key":"RWo47nSt9K"},{"type":"text","value":"2.38","key":"LsVGvTPZ02"},{"type":"text","value":")","key":"WHAo3NKY0K"}],"template":"(%s)","enumerator":"2.38","resolved":true,"html_id":"general-quadratic-cost","key":"Gh2JCpXMCI"},{"type":"text","value":". Derive the corresponding\nquantities ","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"zgvHB913xY"},{"type":"inlineMath","value":"Q, R, M, q, r, c","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"html":"Q,R,M,q,r,cQ, R, M, q, r, cQ,R,M,q,r,c","key":"IBMTS4vOSw"},{"type":"text","value":".","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"CtElK04ds5"}],"key":"PgTfUKzSEZ"},{"type":"heading","depth":3,"position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"children":[{"type":"text","value":"Finite differencing","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"iK7PphstuY"}],"identifier":"finite-differencing","label":"Finite differencing","html_id":"finite-differencing","implicit":true,"enumerator":"2.6.2","key":"srK0lb4ytz"},{"type":"paragraph","position":{"start":{"line":932,"column":1},"end":{"line":936,"column":1}},"children":[{"type":"text","value":"To calculate these gradients and Hessians in practice,\nwe use a method known as ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"ljJO34eDyu"},{"type":"strong","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"finite differencing","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"cTIVOi33Zh"}],"key":"V4ENUMuhp1"},{"type":"text","value":" for numerically computing derivatives.\nNamely, we can simply use the limit definition of the derivative, and\nsee how the function changes as we add or subtract a tiny ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"V4KBz22iLM"},{"type":"text","value":"δ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"nBb6tJ3Iwm"},{"type":"text","value":" to\nthe input.","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"rLDIUZML0F"}],"key":"aH5m0LS0OF"},{"type":"math","value":"\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - f(x)}{\\delta}","position":{"start":{"line":939,"column":1},"end":{"line":941,"column":1}},"html":"ddxf(x)=limδ0f(x+δ)f(x)δ\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - 
f(x)}{\\delta}dxdf(x)=δ0limδf(x+δ)f(x)","enumerator":"2.46","key":"IhJjX3J9VH"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":946,"column":1}},"children":[{"type":"text","value":"Note that this only requires us to be able to ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"xCou6WIgQp"},{"type":"emphasis","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"query","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"SlmFJXJBvU"}],"key":"qoY2Ept2oN"},{"type":"text","value":" the function, not\nto have an analytical expression for it, which is why it’s so useful in\npractice.","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"JmGnCvVJgX"}],"key":"p0fR7pDL5g"},{"type":"heading","depth":3,"position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Local convexification","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"cg4lvlZ7qv"}],"identifier":"local-convexification","label":"Local convexification","html_id":"local-convexification","implicit":true,"enumerator":"2.6.3","key":"tD3hgTJ3iL"},{"type":"paragraph","position":{"start":{"line":950,"column":1},"end":{"line":953,"column":1}},"children":[{"type":"text","value":"However, simply taking the second-order approximation of the cost\nfunction is insufficient, since for the LQR setup we required that the\n","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"PLDlckpOxa"},{"type":"inlineMath","value":"Q","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"QQQ","key":"DgXSADD7Z7"},{"type":"text","value":" and ","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"JwshxoUApj"},{"type":"inlineMath","value":"R","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"RRR","key":"wiED909In9"},{"type":"text","value":" matrices were positive definite, i.e. 
2.6.3 Local convexification

However, simply taking the second-order approximation of the cost function is insufficient, since for the LQR setup we required that the $Q$ and $R$ matrices were positive definite, i.e. that all of their eigenvalues were positive.

One way to naively force some symmetric matrix $D$ to be positive definite is to set any non-positive eigenvalues to some small positive value $\varepsilon > 0$. Recall that any real symmetric matrix $D \in \mathbb{R}^{n \times n}$ has an orthonormal basis of eigenvectors $u_1, \dots, u_n$ with corresponding eigenvalues $\lambda_1, \dots, \lambda_n$ such that $D u_i = \lambda_i u_i$. Then we can construct the positive definite approximation by

$$
\widetilde{D} = \left( \sum_{i=1, \dots, n \mid \lambda_i > 0} \lambda_i u_i u_i^\top \right) + \varepsilon I.
$$

Exercise: Convince yourself that $\widetilde{D}$ is indeed positive definite.

Note that Hessian matrices are generally symmetric, so we can apply this process to $Q$ and $R$ to obtain the positive definite approximations $\widetilde{Q}$ and $\widetilde{R}$. Now that we have an upward-curved quadratic approximation to the cost function, and a linear approximation to the state transitions, we can simply apply the time-homogenous LQR methods from Section 2.4.
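A minimal sketch of this eigenvalue-clipping construction, using the eigendecomposition of a symmetric matrix (helper name ours):

```python
import numpy as np

def make_positive_definite(D, eps=1e-6):
    """Drop non-positive eigenvalues of a symmetric matrix D and add eps * I."""
    lambdas, U = np.linalg.eigh(D)                 # D = U diag(λ) Uᵀ
    lambdas = np.where(lambdas > 0, lambdas, 0.0)  # keep only positive eigenvalues
    return (U * lambdas) @ U.T + eps * np.eye(D.shape[0])
```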
","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"fwmyIlhC08"},{"type":"inlineMath","value":"\\st^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"x\\st^\\starx","key":"nEfQ2BEvOU"},{"type":"text","value":" or want\nto use actions far from ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"tpAo5e5vDB"},{"type":"inlineMath","value":"\\act^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"u\\act^\\staru","key":"HOnhEosy8E"},{"type":"text","value":"? A Taylor approximation is only\naccurate in a ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"otBAPs7nAk"},{"type":"emphasis","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"local","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"cGVByoXFdY"}],"key":"MazjOCtvW3"},{"type":"text","value":" region around the point of linearization, so the\nperformance of our LQR controller will degrade as we move further away.\nWe’ll see how to address this in the next section using the ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"vJYLaBMsrm"},{"type":"strong","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"zawpVclttI"}],"key":"vAVam7I4YH"},{"type":"text","value":" algorithm.","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"p4sC9s1De2"}],"key":"GxIR0Mb5LP"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.png","alt":"Local linearization might only be accurate in a small region around the\npoint of linearization.","data":{"altTextIsAutoGenerated":true},"key":"UmtnrEiEk4","urlSource":"shared/log_taylor.png","urlOptimized":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":989,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"local_linearization","identifier":"local_linearization","html_id":"local-linearization","enumerator":"2.3","children":[{"type":"text","value":"Figure ","key":"s5OG83nY92"},{"type":"text","value":"2.3","key":"zBD17Ge67K"},{"type":"text","value":":","key":"toz9TiJxTD"}],"template":"Figure %s:","key":"NeUDXpx3k9"},{"type":"text","value":"Local linearization might only be accurate in a small region around the\npoint of linearization.","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"jsxQXzobPq"}],"key":"DANgYCYlR2"}],"key":"BIgB6ErPOY"}],"label":"local_linearization","identifier":"local_linearization","enumerator":"2.3","html_id":"local-linearization","key":"f0kXqI10K4"},{"type":"heading","depth":3,"position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"key":"HcgqPcRS44"}],"label":"iterative_lqr","identifier":"iterative_lqr","html_id":"iterative-lqr","enumerator":"2.6.4","key":"TqNQ5Vcvx5"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"To address these issues with local 
linearization, we’ll use an iterative\napproach, where we repeatedly linearize around different points to\ncreate a ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"zSUJvTmOiv"},{"type":"emphasis","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"time-dependent","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"oOdM4QJTfW"}],"key":"eEVvXi7GQl"},{"type":"text","value":" approximation of the dynamics, and then solve\nthe resulting time-dependent LQR problem to obtain a better policy. This\nis known as ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"PDRhXmYcPH"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"uXC0zhuRSm"}],"key":"mKUWiybstb"},{"type":"text","value":" or ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Ys6xMqMv0L"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iLQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"wFUQpHlcN0"}],"key":"Mk5tx71hH1"},{"type":"text","value":":","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"rSzSevRe9w"}],"key":"H1VLlYMkdN"},{"type":"proof","kind":"definition","label":"ilqr","identifier":"ilqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"sab7wVc52x"}],"key":"b9mE6w9wnQ"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"For each iteration of the algorithm:","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"BdZi3u1MVW"}],"key":"vuDwavxmFG"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":1006,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":1006,"column":1},"end":{"line":1007,"column":1}},"children":[{"type":"text","value":"Form a time-dependent LQR problem around the current candidate\ntrajectory using local linearization.","position":{"start":{"line":1006,"column":1},"end":{"line":1006,"column":1}},"key":"FnsYYXbk3v"}],"key":"ZhQ3aVxYfX"},{"type":"listItem","spread":true,"position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Compute the optimal policy using ","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"LCAqh4y22e"},{"type":"crossReference","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Section ","key":"qpzt6GOe25"},{"type":"text","value":"2.5.1","key":"LZnbaMe2Ui"}],"identifier":"time_dep_lqr","label":"time_dep_lqr","kind":"heading","template":"Section 
%s","enumerator":"2.5.1","resolved":true,"html_id":"time-dep-lqr","key":"UTFj4MKioY"},{"type":"text","value":".","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"AOESNZhPmj"}],"key":"OvWnBu8tBl"},{"type":"listItem","spread":true,"position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"children":[{"type":"text","value":"Generate a new series of actions using this policy.","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"FIApQGnNKk"}],"key":"hhEIRf0pSR"},{"type":"listItem","spread":true,"position":{"start":{"line":1010,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"Compute a better candidate trajectory by interpolating between the\ncurrent and proposed actions.","position":{"start":{"line":1010,"column":1},"end":{"line":1010,"column":1}},"key":"LjHFOtviEg"}],"key":"uf9lBX89Bu"}],"key":"nmNISUeT9y"}],"enumerator":"2.9","html_id":"ilqr","key":"o11eGS8IJb"},{"type":"paragraph","position":{"start":{"line":1014,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Now let’s go through the details of each step. We’ll use superscripts to\ndenote the iteration of the algorithm. We’ll also denote\n","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"Pfz1Ngqz01"},{"type":"inlineMath","value":"\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"html":"xˉ0=Ex0μ0[x0]\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]xˉ0=Ex0μ0[x0]","key":"lx5yzxtgNZ"},{"type":"text","value":" as the expected initial\nstate.","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"XWYMy8wsQ5"}],"key":"hCGbzHqCsn"},{"type":"paragraph","position":{"start":{"line":1019,"column":1},"end":{"line":1021,"column":1}},"children":[{"type":"text","value":"At iteration ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"WR6zg6ghLy"},{"type":"inlineMath","value":"i","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"iii","key":"HaoqJNL4HQ"},{"type":"text","value":" of the algorithm, we begin with a ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"m6NW0U2gxd"},{"type":"strong","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"children":[{"type":"text","value":"candidate","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"Tm1SVlIvTz"}],"key":"iPYSVhfidq"},{"type":"text","value":"\ntrajectory\n","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"jGmNQsx0wF"},{"type":"inlineMath","value":"\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)","key":"Ol9Sw7hhR1"},{"type":"text","value":".","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"W3Z3Rrv04i"}],"key":"eaWIeGobOQ"},{"type":"paragraph","position":{"start":{"line":1023,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"Step 1: Form a time-dependent 
LQR problem.","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"KP1HvDx4XS"}],"key":"CHb41n9VTB"},{"type":"text","value":" At each timestep\n","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"T0NepsnWZM"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"Y5czPP4rQZ"},{"type":"text","value":", we use the techniques from\n","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"W53IJMO7zS"},{"type":"crossReference","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"Section ","key":"oNhka1zJSU"},{"type":"text","value":"2.6","key":"INoHGnMxEe"}],"identifier":"approx_nonlinear","label":"approx_nonlinear","kind":"heading","template":"Section %s","enumerator":"2.6","resolved":true,"html_id":"approx-nonlinear","key":"UmIp07XaBw"},{"type":"text","value":" to linearize the dynamics and\nquadratize the cost function around ","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"qDwTiuRVGU"},{"type":"inlineMath","value":"(\\bar \\st^i_\\hi, \\bar \\act^i_\\hi)","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"html":"(xˉhi,uˉhi)(\\bar \\st^i_\\hi, \\bar \\act^i_\\hi)(xˉhi,uˉhi)","key":"EKA7L8JUF9"},{"type":"text","value":":","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"wExYXFZWvg"}],"key":"ivr6fHd4mV"},{"type":"math","value":"\\begin{aligned}\n f_\\hi(\\st, \\act) & \\approx f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\nabla_{\\st } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\st - \\bar {\\st}^i_\\hi) + \\nabla_{\\act } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\act - \\bar {\\act}^i_\\hi) \\\\\n c_\\hi(\\st, \\act) & \\approx c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st } c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\\\\\n \\nabla_{\\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix} \\\\\n & \\qquad + \\frac{1}{2} \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\st \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) \\\\\n \\nabla_{\\act \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\act \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix}\n \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi\\\\\n \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix}.\n\\end{aligned}","position":{"start":{"line":1029,"column":1},"end":{"line":1049,"column":1}},"html":"fh(x,u)f(xˉhi,uˉhi)+xf(xˉhi,uˉhi)(xxˉhi)+uf(xˉhi,uˉhi)(uuˉhi)ch(x,u)c(xˉhi,uˉhi)+[xxˉhiuuˉhi][xc(xˉhi,uˉhi)uc(xˉhi,uˉhi)]+12[xxˉhiuuˉhi][xxc(xˉhi,uˉhi)xuc(xˉhi,uˉhi)uxc(xˉhi,uˉhi)uuc(xˉhi,uˉhi)][xxˉhiuuˉhi].\\begin{aligned}\n f_\\hi(\\st, \\act) & \\approx f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\nabla_{\\st } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\st - \\bar {\\st}^i_\\hi) + \\nabla_{\\act } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\act - \\bar {\\act}^i_\\hi) \\\\\n c_\\hi(\\st, \\act) & \\approx c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} 
\\begin{bmatrix}\n \\nabla_{\\st } c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\\\\\n \\nabla_{\\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix} \\\\\n & \\qquad + \\frac{1}{2} \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\st \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) \\\\\n \\nabla_{\\act \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\act \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix}\n \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi\\\\\n \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix}.\n\\end{aligned}fh(x,u)ch(x,u)f(xˉhi,uˉhi)+xf(xˉhi,uˉhi)(xxˉhi)+uf(xˉhi,uˉhi)(uuˉhi)c(xˉhi,uˉhi)+[xxˉhiuuˉhi][xc(xˉhi,uˉhi)uc(xˉhi,uˉhi)]+21[xxˉhiuuˉhi][xxc(xˉhi,uˉhi)uxc(xˉhi,uˉhi)xuc(xˉhi,uˉhi)uuc(xˉhi,uˉhi)][xxˉhiuuˉhi].","enumerator":"2.48","key":"yXnuGgzYDU"},{"type":"paragraph","position":{"start":{"line":1053,"column":1},"end":{"line":1056,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"children":[{"type":"text","value":"Step 2: Compute the optimal policy.","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"aLp9qVccx8"}],"key":"xPfpmpcc3N"},{"type":"text","value":" We can now solve the\ntime-dependent LQR problem using the Riccati equation from\n","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"kv4rrlMFHq"},{"type":"crossReference","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"children":[{"type":"text","value":"Section ","key":"c5dPmJ4c6E"},{"type":"text","value":"2.5.1","key":"xYeaCyY05u"}],"identifier":"time_dep_lqr","label":"time_dep_lqr","kind":"heading","template":"Section %s","enumerator":"2.5.1","resolved":true,"html_id":"time-dep-lqr","key":"oNLQ87TTnK"},{"type":"text","value":" to compute the optimal policy\n","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"fIWk0KWfPH"},{"type":"inlineMath","value":"\\pi^i_0, \\dots, \\pi^i_{\\hor-1}","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"html":"π0i,,πH1i\\pi^i_0, \\dots, \\pi^i_{\\hor-1}π0i,,πH1i","key":"l1kFQNMXHW"},{"type":"text","value":".","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"aYuiJyjQ6L"}],"key":"If8vM20mVe"},{"type":"paragraph","position":{"start":{"line":1058,"column":1},"end":{"line":1059,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"children":[{"type":"text","value":"Step 3: Generate a new series of actions.","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"key":"kK5R3L5RCp"}],"key":"i3Gf7mZWtc"},{"type":"text","value":" We can then generate a new\nsample trajectory by taking actions according to this optimal policy:","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"key":"Xv1ruqtUGw"}],"key":"vHtT0FnQey"},{"type":"math","value":"\\bar \\st^{i+1}_0 = \\bar \\st_0, \\qquad \\widetilde \\act_\\hi = \\pi^i_\\hi(\\bar \\st^{i+1}_\\hi), \\qquad \\bar \\st^{i+1}_{\\hi+1} = f(\\bar \\st^{i+1}_\\hi, \\widetilde \\act_\\hi).","position":{"start":{"line":1062,"column":1},"end":{"line":1064,"column":1}},"html":"xˉ0i+1=xˉ0,u~h=πhi(xˉhi+1),xˉh+1i+1=f(xˉhi+1,u~h).\\bar \\st^{i+1}_0 = \\bar \\st_0, \\qquad \\widetilde \\act_\\hi = 
\\pi^i_\\hi(\\bar \\st^{i+1}_\\hi), \\qquad \\bar \\st^{i+1}_{\\hi+1} = f(\\bar \\st^{i+1}_\\hi, \\widetilde \\act_\\hi).xˉ0i+1=xˉ0,uh=πhi(xˉhi+1),xˉh+1i+1=f(xˉhi+1,uh).","enumerator":"2.49","key":"JDJ43JXyqP"},{"type":"paragraph","position":{"start":{"line":1067,"column":1},"end":{"line":1068,"column":1}},"children":[{"type":"text","value":"Note that the states are sampled according to the ","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"whmKYiKmOZ"},{"type":"emphasis","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"children":[{"type":"text","value":"true","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"L0bwVHLJaG"}],"key":"jgU2pT3J3T"},{"type":"text","value":" dynamics, which\nwe assume we have query access to.","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"JCTDLrUkQ9"}],"key":"OuPWx8Qybe"},{"type":"paragraph","position":{"start":{"line":1070,"column":1},"end":{"line":1077,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"Step 4: Compute a better candidate trajectory.","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"dLd4FEqo4y"}],"key":"ZdVJ8JkMHZ"},{"type":"text","value":", Note that we’ve\ndenoted these actions as ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"fKoLkmLVgX"},{"type":"inlineMath","value":"\\widetilde \\act_\\hi","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"u~h\\widetilde \\act_\\hiuh","key":"paUGVoZGiR"},{"type":"text","value":" and aren’t directly using\nthem for the next iteration ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"MrujkOzWCi"},{"type":"inlineMath","value":"\\bar \\act^{i+1}_\\hi","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"uˉhi+1\\bar \\act^{i+1}_\\hiuˉhi+1","key":"MF2aiqMwL6"},{"type":"text","value":". Rather, we want to\n","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"VqxKgrYmX5"},{"type":"emphasis","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"interpolate","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"x6rNpNsuzO"}],"key":"VeN4HZjrjD"},{"type":"text","value":" between them and the actions from the previous iteration\n","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"R0MGTAjRXD"},{"type":"inlineMath","value":"\\bar \\act^i_0, \\dots, \\bar \\act^i_{\\hor-1}","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"uˉ0i,,uˉH1i\\bar \\act^i_0, \\dots, \\bar \\act^i_{\\hor-1}uˉ0i,,uˉH1i","key":"YwErKoJ2J9"},{"type":"text","value":". This is so that the cost\nwill ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"XZqYZgGEzP"},{"type":"emphasis","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"increase monotonically,","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"xRRtXBUOMh"}],"key":"kVheV701vU"},{"type":"text","value":" since if the new policy turns out to\nactually be worse, we can stay closer to the previous trajectory. 
(Can\nyou think of an intuitive example where this might happen?)","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"CKOd0R4fOi"}],"key":"l3rAl7ayBy"},{"type":"paragraph","position":{"start":{"line":1079,"column":1},"end":{"line":1082,"column":1}},"children":[{"type":"text","value":"Formally, we want to find ","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"DrSRxstLY1"},{"type":"inlineMath","value":"\\alpha \\in [0, 1]","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"α[0,1]\\alpha \\in [0, 1]α[0,1]","key":"IynuSU1dPD"},{"type":"text","value":" to generate the next\niteration of actions\n","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"azTk4GwAYK"},{"type":"inlineMath","value":"\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"uˉ0i+1,,uˉH1i+1\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}uˉ0i+1,,uˉH1i+1","key":"ICKYjmRB78"},{"type":"text","value":" such that the cost\nis minimized:","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"pdkCp3L9Zk"}],"key":"gM406weRLS"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n & \\st_0 = \\bar \\st_0.\n\\end{aligned}","position":{"start":{"line":1084,"column":1},"end":{"line":1091,"column":1}},"html":"minα[0,1]h=0H1c(xh,uˉhi+1)wherexh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)u~hx0=xˉ0.\\begin{aligned}\n \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n & \\st_0 = \\bar \\st_0.\n\\end{aligned}α[0,1]minwhereh=0H1c(xh,uˉhi+1)xh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)uhx0=xˉ0.","enumerator":"2.50","key":"ztPinY9uyq"},{"type":"paragraph","position":{"start":{"line":1093,"column":1},"end":{"line":1095,"column":1}},"children":[{"type":"text","value":"Note that this optimizes over the closed interval\n","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"FgR3JhJntd"},{"type":"inlineMath","value":"[0, 1]","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"html":"[0,1][0, 1][0,1]","key":"CXmwfvaVe8"},{"type":"text","value":", so by the Extreme Value Theorem, it’s guaranteed to have a\nglobal maximum.","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"QlkC7o6fXe"}],"key":"A7J5nhBHux"},{"type":"paragraph","position":{"start":{"line":1097,"column":1},"end":{"line":1101,"column":1}},"children":[{"type":"text","value":"The final output of this algorithm is a policy ","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"w1n40z9L1m"},{"type":"inlineMath","value":"\\pi^{n_\\text{steps}}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"πnsteps\\pi^{n_\\text{steps}}πnsteps","key":"N32BQ337KZ"},{"type":"text","value":"\nderived after 
","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"w6FenIoM5q"},{"type":"inlineMath","value":"n_\\text{steps}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"nstepsn_\\text{steps}nsteps","key":"V0UCFT6Djb"},{"type":"text","value":" of the algorithm. Though the proof is\nsomewhat complex, one can show that for many nonlinear control problems,\nthis solution converges to a locally optimal solution (in the policy\nspace).","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"dTwPP6O6b8"}],"key":"lNGt0PZyvI"},{"type":"heading","depth":2,"position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"key":"mm80d5oLp9"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"2.7","key":"VL7QY0LpXM"},{"type":"paragraph","position":{"start":{"line":1105,"column":1},"end":{"line":1112,"column":1}},"children":[{"type":"text","value":"This chapter introduced some approaches to solving different variants of\nthe optimal control problem\n","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"L0gpQtdQho"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"i4o7ZJV6zY"},{"type":"text","value":"2.1","key":"NLNS94V2OF"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"C9rX2dRV7m"},{"type":"text","value":". We began with the simple case of linear\ndynamics and an upward-curved quadratic cost. This model is called the\nLQR and we solved for the optimal policy using dynamic programming. We\nthen extended these results to the more general nonlinear case via local\nlinearization. We finally saw the iterative LQR algorithm for solving\nnonlinear control problems.","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"uCn7vNdJwY"}],"key":"SrKjiY8J4E"}],"key":"Godf9BJFef"}],"key":"vHaXFYpcGJ"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/exploration.html b/exploration.html index 0f8bba1..b1445fe 100644 --- a/exploration.html +++ b/exploration.html @@ -14,18 +14,18 @@ ); root.querySelectorAll(".hide-mac").forEach(node => {node.classList.add(isMac ? "hidden" : "block")}); root.querySelectorAll(".show-mac").forEach(node => {node.classList.add(!isMac ? "hidden" : "block")}); -})()

where $N_t^k$ indicates the number of times arm $k$ has been pulled up until time $t$, $R_t^k$ indicates the total reward obtained by pulling arm $k$ up until time $t$, and $\delta > 0$ controls the width of the confidence interval. How might we extend UCB to the MDP case?

Let us formally describe an unknown MDP as an MAB problem. In an unknown MDP, we want to learn which policy is optimal. So if we want to apply MAB techniques to solving an MDP, it makes sense to think of arms as policies. There are $K = (|\mathcal{A}|^{|\mathcal{S}|})^H$ deterministic policies in a finite MDP. Then, "pulling" arm $\pi$ corresponds to using $\pi$ to act through a trajectory in the MDP, and observing the total reward.

Recall that UCB incurs regret $\tilde{O}(\sqrt{TK})$, where $K$ is the number of arms. Treating each of the $K = |\mathcal{A}|^{|\mathcal{S}| H}$ deterministic policies as an arm, this becomes

$$\tilde{O}\left(\sqrt{T \, |\mathcal{A}|^{|\mathcal{S}| H}}\right) \qquad (9.4)$$

This scales exponentially in $|\mathcal{S}|$ and $H$, which quickly becomes intractable. Notably, this method doesn't consider the information that we gain across different policies. We can illustrate this with the following example:

9.3 UCB-VI

The approach above is inefficient: We shouldn't need to consider all $|\mathcal{A}|^{|\mathcal{S}| H}$ deterministic policies to achieve low regret. Rather, all we need to describe the optimal policy is $Q^\star$, which has $H |\mathcal{S}||\mathcal{A}|$ entries to be learned. Can we borrow ideas from UCB to reduce the regret to this order (i.e. polynomial in $|\mathcal{S}|$, $|\mathcal{A}|$, and $H$)?

One way to frame the UCB algorithm is that, when choosing arms, we optimize over a proxy reward that is the sum of the estimated mean reward and an exploration term. In the UCB-VI algorithm, we will extend this idea to the case of an unknown MDP $\mathcal{M}^{?}$ by modelling a proxy MDP $\tilde{\mathcal{M}}$ with a reward function that encourages exploration. Then, we will use DP to solve for the optimal policy in $\tilde{\mathcal{M}}$.

Assumptions: For simplicity, here we assume the reward function of $\mathcal{M}^{?}$ is known, so we only need to model the state transitions, though the rewards can be modelled similarly. We will also consider the more general case of a time-varying MDP, where the transition and reward functions can change over time. We take the convention that $P_h$ is the distribution of $s_{h+1} \mid s_h, a_h$ and $r_h$ is applied to $s_h, a_h$.

At a high level, the UCB-VI algorithm can be described as follows:

  1. Modelling: Use previous data to model the transitions $\hat{P}_0, \dots, \hat{P}_{H-1}$.

  2. Reward bonus: Design a reward bonus $b_h(s, a) \in \mathbb{R}$ to encourage exploration, analogous to the UCB term.

  3. Optimistic planning: Use DP to compute the optimal policy $\hat \pi_h(s)$ in the modelled MDP

$$\tilde{\mathcal{M}} = (\mathcal{S}, \mathcal{A}, \{ \hat{P}_h \}_{h \in [H]}, \{ r_h + b_h \}_{h \in [H]}, H).$$

  4. Execution: Use $\hat \pi_h(s)$ to collect a new trajectory, and repeat.

We detail each of these steps below. The full definition follows in (9.16).

9.3.1 Modelling the transitions

We seek to approximate $P_h(s_{h+1} \mid s_h, a_h) = \frac{\mathbb{P}(s_h, a_h, s_{h+1})}{\mathbb{P}(s_h, a_h)}$. We can estimate these using their sample probabilities from the dataset. That is, define

$$
\begin{aligned}
N_h^t(s, a, s') & := \sum_{i=0}^{t-1} \mathbf{1}\{ (s_h^i, a_h^i, s_{h+1}^i) = (s, a, s') \} \\
N_h^t(s, a) & := \sum_{i=0}^{t-1} \mathbf{1}\{ (s_h^i, a_h^i) = (s, a) \}
\end{aligned}
$$


Then we can model

$$\hat{P}_h^t(s' \mid s, a) = \frac{N_h^t(s, a, s')}{N_h^t(s, a)}.$$
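A sketch of this count-based estimator; the array layout and the uniform fallback for unvisited $(s, a)$ pairs are our choices:

```python
import numpy as np

def estimate_transitions(transitions, S, A, H):
    """Estimate P_hat[h, s, a] = N_h(s, a, .) / N_h(s, a) from observed steps.

    `transitions` is a list of (h, s, a, s_next) tuples pooled across episodes.
    """
    N = np.zeros((H, S, A, S))
    for h, s, a, s_next in transitions:
        N[h, s, a, s_next] += 1
    counts = N.sum(axis=-1, keepdims=True)  # N_h(s, a)
    # fall back to a uniform distribution over next states for unvisited (s, a)
    return np.where(counts > 0, N / np.maximum(counts, 1), 1.0 / S)
```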

9.3.2 Reward bonus

To motivate the reward bonus term $b_h^t(s, a)$, recall how we designed the reward bonus term for UCB:

  1. We used Hoeffding's inequality to bound, with high probability, how far the sample mean $\hat \mu_t^k$ deviated from the true mean $\mu^k$.

  2. By inverting this inequality, we obtained a $(1-\delta)$-confidence interval for the true mean, centered at our estimate.

  3. To make this bound uniform across all timesteps $t \in [T]$, we applied the union bound, dividing $\delta$ by a factor of $T$.

We'd like to do the same for UCB-VI, and construct the bonus term such that $V^\star_h(s) \le \hat{V}_h^t(s)$ with high probability. However, our construction will be more complex than the MAB case, since $\hat{V}_h^t(s)$ depends on the bonus $b_h^t(s, a)$ implicitly via DP. We claim that the bonus term that gives the proper bound is

$$b_h^t(s, a) = 2 H \sqrt{\frac{\log( |\mathcal{S}||\mathcal{A}|H T/\delta )}{N_h^t(s, a)}}.$$

We will only provide a heuristic sketch of the proof; see Agarwal et al. (2022) (Section 7.3) for a full proof.
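To make the procedure concrete, here is a sketch of the bonus computation and the optimistic planning step (backward DP on the proxy MDP). Clipping the optimistic values at $H$ is a standard implementation detail we add here so the bonuses don't compound unboundedly; all names are ours.

```python
import numpy as np

def ucbvi_bonus(N, H, T, delta):
    """b_h^t(s, a) = 2H sqrt(log(S A H T / delta) / N_h^t(s, a)); N has shape (H, S, A)."""
    _, S, A = N.shape
    log_term = np.log(S * A * H * T / delta)
    return 2 * H * np.sqrt(log_term / np.maximum(N, 1))

def optimistic_planning(P_hat, r, b, H):
    """Backward DP on the proxy MDP with rewards r + b; returns a greedy policy per step."""
    S = P_hat.shape[1]
    V = np.zeros(S)
    pi = np.zeros((H, S), dtype=int)
    for h in reversed(range(H)):
        Q = r[h] + b[h] + np.einsum("ijk,k->ij", P_hat[h], V)  # (S, A)
        pi[h] = Q.argmax(axis=1)
        V = np.minimum(Q.max(axis=1), H)  # optimistic values, clipped at H
    return pi
```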

Comparing this to the UCB regret bound $\tilde{O}(\sqrt{TK})$, where $K$ is the number of arms of the MAB, we see that we've reduced the number of effective arms from $|\mathcal{A}|^{|\mathcal{S}|H}$ (in (9.4)) to $H^4 |\mathcal{S}||\mathcal{A}|$, which is indeed polynomial in $|\mathcal{S}|$, $|\mathcal{A}|$, and $H$, as desired. This is also roughly the number of episodes it takes to achieve constant-order average regret:

$$\frac{1}{T} \mathbb{E}[\text{Regret}_T] = \tilde{O}\left(\sqrt{\frac{H^4 |\mathcal{S}||\mathcal{A}|}{T}}\right)$$

Note that the time-dependent transition matrix has $H |\mathcal{S}|^2 |\mathcal{A}|$ entries. Assuming $H \ll |\mathcal{S}|$, this shows that it's possible to achieve low regret, and achieve a near-optimal policy, while only understanding a $1/|\mathcal{S}|$ fraction of the world's dynamics.

9.4 Linear MDPs

A polynomial dependency on $|\mathcal{S}|$ and $|\mathcal{A}|$ is manageable when the state and action spaces are small. But for large or continuous state and action spaces, even this polynomial factor will become intractable. Can we find algorithms that don't depend on $|\mathcal{S}|$ or $|\mathcal{A}|$ at all, effectively reducing the dimensionality of the MDP? In this section, we'll explore linear MDPs: an example of a parameterized MDP where the rewards and state transitions depend only on some parameter space of dimension $d$ that is independent from $|\mathcal{S}|$ or $|\mathcal{A}|$.

9.4.1 Planning in a linear MDP

It turns out that $Q^\star_h$ is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize $V_H^\star(s) = 0$ for all $s$. Then we iterate:

$$
\begin{aligned}
Q^\star_h(s, a) & = r_h(s, a) + \mathbb{E}_{s' \sim P_h(\cdot \mid s, a)} [V^\star_{h+1}(s')] \\
& = \phi(s, a)^\top \theta_h^\star + (\mu_h^\star \phi(s, a))^\top V^\star_{h+1} \\
& = \phi(s, a)^\top \underbrace{( \theta_h^\star + (\mu_h^\star)^\top V^\star_{h+1})}_{w_h} \\
V^\star_h(s) & = \max_a Q^\star_h(s, a) \\
\pi^\star_h(s) & = \arg\max_a Q^\star_h(s, a)
\end{aligned}
$$
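A sketch of this recursion for finite state and action spaces, where `phi` is an `(S, A, d)` array of features and `theta`, `mu` hold the per-step true parameters (names and array layout ours):

```python
import numpy as np

def linear_mdp_planning(phi, theta, mu, H):
    """Compute w_h = theta_h + mu_h^T V_{h+1} by backward DP; Q_h(s,a) = phi(s,a)^T w_h."""
    S, A, d = phi.shape
    V = np.zeros(S)                      # V*_H = 0
    ws = [None] * H
    for h in reversed(range(H)):
        ws[h] = theta[h] + mu[h].T @ V   # mu[h] has shape (S, d)
        Q = phi @ ws[h]                  # shape (S, A)
        V = Q.max(axis=1)                # V*_h(s) = max_a Q*_h(s, a)
    return ws
```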

9.4.2 UCB-VI in a linear MDP

9.4.2.1 Modelling the transitions

This linear assumption on the MDP will also allow us to model the unknown dynamics $P^?_h(s' \mid s, a)$ with techniques from supervised learning (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. We can rephrase the estimation of $P^?_h(s' \mid s, a)$ as a least-squares problem as follows: Write $\delta_s$ to denote a one-hot vector in $\mathbb{R}^{|\mathcal{S}|}$, with a 1 in the $s$-th entry and 0 everywhere else. Note that

$$\mathbb{E}_{s' \sim P_h(\cdot \mid s, a)} [\delta_{s'}] = P_h(\cdot \mid s, a) = \mu_h^\star \phi(s, a).$$

Furthermore, since the expectation here is linear with respect to $\phi(s, a)$, we can directly apply least-squares multi-target linear regression to construct the estimate

$$\hat \mu = \arg\min_{\mu \in \mathbb{R}^{|\mathcal{S}| \times d}} \sum_{i=0}^{t-1} \|\mu \phi(s_h^i, a_h^i) - \delta_{s_{h+1}^i} \|_2^2.$$

This has a well-known closed-form solution:

μ^=(Aht)1i=0t1ϕ(shi,ahi)δsh+1iwhereAht=i=0t1ϕ(shi,ahi)ϕ(shi,ahi)+λI\begin{aligned} +-174 2.7-5 6-9 10-13 .7-1 7.3-1 20-1h17z'/>(θh+(μh)Vh+1)=amaxQh(s,a)=argamaxQh(s,a)

9.4.2 UCB-VI in a linear MDP

9.4.2.1 Modelling the transitions

This linear assumption on the MDP will also allow us to model the unknown dynamics $P^?_h(s' \mid s, a)$ with techniques from supervised learning (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. We can rephrase the estimation of $P^?_h(s' \mid s, a)$ as a least-squares problem as follows: Write $\delta_s$ to denote a one-hot vector in $\mathbb{R}^{|\mathcal{S}|}$, with a 1 in the $s$-th entry and 0 everywhere else. Note that

$$\E_{s' \sim P_h(\cdot \mid s, a)} [\delta_{s'}] = P_h(\cdot \mid s, a) = \mu_h^\star \phi(s, a).$$

Furthermore, since the expectation here is linear with respect to $\phi(s, a)$, we can directly apply least-squares multi-target linear regression to construct the estimate

$$\hat \mu = \arg\min_{\mu \in \mathbb{R}^{|\mathcal{S}| \times d}} \sum_{i=0}^{t-1} \|\mu \phi(s_h^i, a_h^i) - \delta_{s_{h+1}^i} \|_2^2.$$

This has a well-known closed-form solution:

$$\begin{aligned}
\hat \mu^\top & = (A_h^t)^{-1} \sum_{i=0}^{t-1} \phi(s_h^i, a_h^i) \delta_{s_{h+1}^i}^\top \\
\text{where} \quad A_h^t & = \sum_{i=0}^{t-1} \phi(s_h^i, a_h^i) \phi(s_h^i, a_h^i)^\top + \lambda I
\end{aligned}$$

where we include a $\lambda I$ term to ensure that the matrix $A^t_h$ is invertible. (This can also be derived by adding a $\lambda \|\mu\|_{\text{F}}^2$ regularization term to the objective.) We can directly plug in this estimate via $\hat{P}^t_h(\cdot \mid s, a) = \hat \mu^t_h \phi(s, a)$.
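In code, this estimate is a single ridge-regression solve. Here is a minimal sketch, where the array layout and names are illustrative assumptions:

import jax
import jax.numpy as np

def estimate_mu(φs, s_nexts, S, λ=1e-3):
    """φs: (t, d) array of features φ(s_h^i, a_h^i); s_nexts: (t,) array of next-state indices."""
    d = φs.shape[1]
    A = φs.T @ φs + λ * np.eye(d)           # A_h^t
    Δ = jax.nn.one_hot(s_nexts, S)          # (t, S); row i is the one-hot vector δ_{s'}
    μ_hat = np.linalg.solve(A, φs.T @ Δ).T  # μ̂ᵀ = (A_h^t)⁻¹ Σ_i φ_i δᵀ_i
    return μ_hat, A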

9.4.2.2 Reward bonus

Now, to design the reward bonus, we can't apply Hoeffding anymore, since the terms no longer involve sample means of bounded random variables: we're now incorporating information across different states and actions. Instead, we can construct an upper bound using Chebyshev's inequality, in the same way we did for the LinUCB algorithm in the MAB setting (Section 3.8.1):

$$b^t_h(s, a) = \beta \sqrt{\phi(s, a)^\top (A^t_h)^{-1} \phi(s, a)}, \quad \beta = \tilde O(d H).$$

Note that this isn't explicitly inversely proportional to $N_h^t(s, a)$ as in the original UCB-VI bonus term (9.8). Rather, it is inversely proportional to the amount that the direction $\phi(s, a)$ has been explored in the history. That is, if $A_h^t$ has a large component in the direction $\phi(s, a)$, implying that this direction is well explored, then the bonus term will be small, and vice versa.
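In code, the bonus is a single quadratic form (a minimal sketch; the names are illustrative):

def reward_bonus(φ_sa, A_inv, β):
    """Elliptical bonus: large in feature directions that are poorly explored."""
    return β * np.sqrt(φ_sa @ A_inv @ φ_sa)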

We can now plug these transition estimates and reward bonuses into the UCB-VI algorithm (9.16).

9.4.2.3 Performance

5 Fitted Dynamic Programming Algorithms

5.1 Introduction

We borrow these definitions from the 1 Markov Decision Processes chapter:

from typing import NamedTuple, Callable, Optional
from jaxtyping import Float, Array
import jax.numpy as np
from jax import grad, vmap
import jax.random as rand
from tqdm import tqdm
import gymnasium as gym

key = rand.PRNGKey(184)


class Transition(NamedTuple):
    s: int
    a: int
    r: float


Trajectory = list[Transition]


def get_num_actions(trajectories: list[Trajectory]) -> int:
    """Get the number of actions in the dataset. Assumes actions range from 0 to A-1."""
    return max(max(t.a for t in τ) for τ in trajectories) + 1


State = Float[Array, "..."]  # arbitrary shape

# assume finite `A` actions and f outputs an array of Q-values
# i.e. Q(s, a, h) is implemented as f(s, h)[a]
QFunction = Callable[[State, int], Float[Array, " A"]]


def Q_zero(A: int) -> QFunction:
    """A Q-function that always returns zero."""
    return lambda s, a: np.zeros(A)


# a deterministic time-dependent policy
Policy = Callable[[State, int], int]


def q_to_greedy(Q: QFunction) -> Policy:
    """Get the greedy policy for the given state-action value function."""
    return lambda s, h: np.argmax(Q(s, h))

The 1 Markov Decision Processes chapter discussed the case of finite MDPs, where the state and action spaces $\mathcal{S}$ and $\mathcal{A}$ were finite. This gave us a closed-form expression for computing the r.h.s. of the Bellman one-step consistency equation. In this chapter, we consider the case of large or continuous state spaces, where the state space is too large to be enumerated. In this case, we need to approximate the value function and Q-function using methods from supervised learning.

We will first take a quick detour to introduce the empirical risk minimization framework for function approximation. We will then see its application to fitted RL algorithms, which attempt to learn the optimal value function (and the optimal policy) from a dataset of trajectories.

5.2 Empirical risk minimization

The supervised learning task is as follows: We seek to learn the relationship between some input variables $x$ and some output variable $y$ (drawn from their joint distribution). Precisely, we want to find a function $\hat f : x \mapsto y$ that minimizes the squared error of the prediction:

$$\hat f = \arg\min_{f} \E[(y - f(x))^2]$$

An equivalent framing is that we seek to approximate the conditional expectation of $y$ given $x$:

$$\arg\min_{f} \E[(y - f(x))^2] = (x \mapsto \E[y \mid x])$$

In most applications, the joint distribution of $x, y$ is unknown or extremely complex, and so we can't analytically evaluate $\E [y \mid x]$. Instead, our strategy is to draw $N$ samples $(x_i, y_i)$ from the joint distribution of $x$ and $y$, and then use the sample average $\sum_{i=1}^N (y_i - f(x_i))^2 / N$ to approximate the mean squared error. Then we use a fitting method to find a function $\hat f$ (from some class of functions $\mathcal{F}$) that minimizes this objective and thus approximates the conditional expectation. This approach is called empirical risk minimization:

$$\hat f = \arg\min_{f \in \mathcal{F}} \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i))^2$$
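As a concrete example, here is a minimal sketch of ERM with the affine function class $\mathcal{F} = \{x \mapsto w x + b\}$ on hypothetical synthetic data (all names and values are illustrative):

import jax.numpy as np
import jax.random as rand

k1, k2 = rand.split(rand.PRNGKey(0))
x = rand.uniform(k1, (100,))
y = 2 * x + 0.1 * rand.normal(k2, (100,))  # samples from a hypothetical joint distribution

# ERM: minimize the sample average of (y_i - f(x_i))^2 over F by least squares
X = np.stack([x, np.ones_like(x)], axis=1)
(w, b), *_ = np.linalg.lstsq(X, y)  # recovers w ≈ 2, b ≈ 0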

5.3 Fitted value iteration

Let us apply ERM to the RL problem of computing the optimal policy / value function.

How did we compute the optimal value function in MDPs with finite state and action spaces?

  • In a finite-horizon MDP, we can use dynamic programming, working backwards from the end of the time horizon, to compute the optimal value function exactly.

  • In an infinite-horizon MDP, we can use value iteration, which iterates the Bellman optimality operator (1.54) to approximately compute the optimal value function.

Our existing approaches represent the value function, and the MDP itself, in matrix notation. But what happens if the state space is extremely large, or even infinite (e.g. real-valued)? Then computing a weighted sum over all possible next states, which is required to compute the Bellman operator, becomes intractable.

Instead, we will need to use function approximation methods from supervised learning to solve for the value function in an alternative way.

In particular, suppose we have a dataset of $N$ trajectories $\tau_1, \dots, \tau_N \sim \rho_{\pi}$ from some policy $\pi$ (called the data collection policy) acting in the MDP of interest. Let us indicate the trajectory index in the superscript, so that

$$\tau_i = \{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \dots, s_{H-1}^i, a_{H-1}^i, r_{H-1}^i \}.$$
def collect_data(
    env: gym.Env, N: int, H: int, key: rand.PRNGKey, π: Optional[Policy] = None
) -> list[Trajectory]:
    """Collect a dataset of trajectories from the given policy (or a random one)."""
    trajectories = []
    seeds = [rand.bits(k).item() for k in rand.split(key, N)]
    for i in tqdm(range(N)):
        τ = []
        s, _ = env.reset(seed=seeds[i])
        for h in range(H):
            # sample from a random policy
            a = π(s, h) if π else env.action_space.sample()
            s_next, r, terminated, truncated, _ = env.step(a)
            τ.append(Transition(s, a, r))
            if terminated or truncated:
                break
            s = s_next
        trajectories.append(τ)
    return trajectories

env = gym.make("LunarLander-v2")
trajectories = collect_data(env, 100, 300, key)
trajectories[0][:5]  # show first five transitions from first trajectory

 
[Transition(s=array([-0.00767412,  1.4020356 , -0.77731264, -0.39489663,  0.00889908,
         0.17607279,  0.        ,  0.        ], dtype=float32), a=np.int64(3), r=np.float64(0.01510799459859527)),
 Transition(s=array([-0.01526899,  1.392572  , -0.766254  , -0.42065707,  0.01559265,
         0.13388489,  0.        ,  0.        ], dtype=float32), a=np.int64(0), r=np.float64(-0.9906126974697145)),
 Transition(s=array([-0.02286405,  1.3825084 , -0.7662748 , -0.44735536,  0.02228237,
         0.13380653,  0.        ,  0.        ], dtype=float32), a=np.int64(0), r=np.float64(-0.9934895324159925)),
 Transition(s=array([-0.0304594 ,  1.3718452 , -0.7662946 , -0.4740309 ,  0.02897082,
         0.13378178,  0.        ,  0.        ], dtype=float32), a=np.int64(2), r=np.float64(1.4450091994476508)),
 Transition(s=array([-0.03802614,  1.361714  , -0.7636849 , -0.45042533,  0.03589968,
         0.1385901 ,  0.        ,  0.        ], dtype=float32), a=np.int64(2), r=np.float64(0.43907361933223116))]

Can we view the dataset of trajectories as a "labelled dataset" in order to apply supervised learning to approximate the optimal Q-function? Yes! Recall that we can characterize the optimal Q-function using the Bellman optimality equations, which don't depend on an actual policy:

$$Q_h^\star(s, a) = r(s, a) + \E_{s' \sim P(s, a)} [\max_{a'} Q_{h+1}^\star(s', a')]$$

We can think of the arguments to the Q-function -- i.e. the current state, action, and timestep $h$ -- as the inputs $x$, and the r.h.s. of the above equation as the label $f(x)$. Note that the r.h.s. can also be expressed as a conditional expectation:

$$f(x) = \E [y \mid x] \quad \text{where} \quad y = r(s_h, a_h) + \max_{a'} Q^\star_{h + 1}(s', a').$$

Approximating the conditional expectation is precisely the task that Section 5.2 is suited for!

Our dataset above would give us $N \cdot H$ samples:

$$x_{i h} = (s_h^i, a_h^i, h) \qquad y_{i h} = r(s_h^i, a_h^i) + \max_{a'} Q^\star_{h + 1}(s_{h + 1}^i, a')$$
def get_X(trajectories: list[Trajectory]):
    """Stack the (state, action, timestep) inputs to the Q-function across all trajectories."""
    rows = [(τ[h].s, τ[h].a, h) for τ in trajectories for h in range(len(τ))]
    return [np.stack(ary) for ary in zip(*rows)]


def get_y(trajectories: list[Trajectory], f: Optional[QFunction] = None, π: Optional[Policy] = None):
    """Compute the labels. If π is None, estimates the optimal Q-function; otherwise the Q-function of π."""
    f = f or Q_zero(get_num_actions(trajectories))
    y = []
    for τ in trajectories:
        for h in range(len(τ) - 1):
            s, a, r = τ[h]
            Q_values = f(s, h + 1)
            y.append(r + (Q_values[π(s, h + 1)] if π else Q_values.max()))
        y.append(τ[-1].r)
    return np.array(y)
s, a, h = get_X(trajectories[:1])
 print("states:", s[:5])
 print("actions:", a[:5])
-print("timesteps:", h[:5])
states: [[-0.00767412  1.4020356  -0.77731264 -0.39489663  0.00889908  0.17607279
+print("timesteps:", h[:5])
states: [[-0.00767412  1.4020356  -0.77731264 -0.39489663  0.00889908  0.17607279
    0.          0.        ]
  [-0.01526899  1.392572   -0.766254   -0.42065707  0.01559265  0.13388489
    0.          0.        ]
- [-0.02275753  1.3831123  -0.75616544 -0.42051664  0.02282397  0.1446398
+ [-0.02286405  1.3825084  -0.7662748  -0.44735536  0.02228237  0.13380653
    0.          0.        ]
- [-0.0303195   1.3730422  -0.76537645 -0.4477334   0.03190061  0.18154952
+ [-0.0304594   1.3718452  -0.7662946  -0.4740309   0.02897082  0.13378178
    0.          0.        ]
- [-0.03779774  1.3623788  -0.7548636  -0.4740972   0.03885893  0.13917959
+ [-0.03802614  1.361714   -0.7636849  -0.45042533  0.03589968  0.1385901
    0.          0.        ]]
-actions: [3 2 1 3 2]
+actions: [3 0 0 2 2]
 timesteps: [0 1 2 3 4]
get_y(trajectories[:1])[:5]
Array([ 0.01510799, -0.9906127 , -0.9934895 , 1.4450092 , 0.43907362], dtype=float32)

Then we can use empirical risk minimization to find a function $\hat f$ that approximates the optimal Q-function.

# We will see some examples of fitting methods in the next section
FittingMethod = Callable[[Float[Array, "N D"], Float[Array, " N"]], QFunction]
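The next section presents fitting methods in detail; purely as a placeholder, here is a minimal hypothetical instance of this interface, `fit_linear`, which fits one least-squares linear model per action on the arrays produced by `get_X` (the name and design are illustrative assumptions):

def fit_linear(X, y) -> QFunction:
    """Hypothetical fitting method: one least-squares linear model per action."""
    s, a, h = X  # the three arrays produced by get_X
    num_actions = int(a.max()) + 1
    feats = np.concatenate([s, h[:, None].astype(float), np.ones((len(y), 1))], axis=1)
    # solve a separate least-squares problem for each action's weight vector
    W = np.stack([np.linalg.lstsq(feats[a == act], y[a == act])[0] for act in range(num_actions)])
    def Q(s, h):
        x = np.concatenate([s, np.array([float(h), 1.0])])
        return W @ x  # vector of Q-values, one per action
    return Q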

But notice that the definition of $y_{i h}$ depends on the Q-function itself! How can we resolve this circular dependency? Recall that we faced the same issue when evaluating a policy in an infinite-horizon MDP. There, we iterated the Bellman operator (Definition 1.8), since we knew that the policy's value function was a fixed point of it. We can apply the same strategy here, using the $\hat f$ from the previous iteration to compute the labels $y_{i h}$, and then using this new dataset to fit the next iterate. This gives fitted Q-function iteration:

  1. Initialize some function $\hat f(s, a, h) \in \mathbb{R}$.
  2. Iterate the following:
     1. Generate a supervised learning dataset $X, y$ from the trajectories and the current estimate $f$, where the labels come from the r.h.s. of the Bellman optimality operator (1.54).
     2. Set $\hat f$ to the function that minimizes the empirical risk: $\hat f \gets \arg\min_f \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i))^2$.

def fitted_q_iteration(
    trajectories: list[Trajectory],
    fit: FittingMethod,
    epochs: int,
    Q_init: Optional[QFunction] = None,
) -> QFunction:
    """
    Run fitted Q-function iteration using the given dataset.
    Returns an estimate of the optimal Q-function.
    """
    Q_hat = Q_init or Q_zero(get_num_actions(trajectories))
    X = get_X(trajectories)
    for _ in range(epochs):
        y = get_y(trajectories, Q_hat)
        Q_hat = fit(X, y)
    return Q_hat
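For example, with the hypothetical `fit_linear` sketch above, we could estimate the optimal Q-function and extract its greedy policy:

Q_hat = fitted_q_iteration(trajectories, fit_linear, epochs=10)
π_hat = q_to_greedy(Q_hat)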

We can also use this fixed-point iteration to evaluate a policy (not necessarily the one used to generate the trajectories) using the dataset:

def fitted_evaluation(
    trajectories: list[Trajectory],
    fit: FittingMethod,
    π: Policy,
    epochs: int,
    Q_init: Optional[QFunction] = None,
) -> QFunction:
    """Run fitted policy evaluation using the given dataset. Returns an estimate of the Q-function of π."""
    Q_hat = Q_init or Q_zero(get_num_actions(trajectories))
    X = get_X(trajectories)
    for _ in tqdm(range(epochs)):
        y = get_y(trajectories, Q_hat, π)
        Q_hat = fit(X, y)
    return Q_hat

We can use this policy evaluation algorithm to adapt the policy iteration algorithm to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. compute its value function) using the iterative fitted_evaluation algorithm.

def fitted_policy_iteration(
    trajectories: list[Trajectory],
    fit: FittingMethod,
    epochs: int,
    evaluation_epochs: int,
    π_init: Optional[Policy] = None,
) -> Policy:
    """Run fitted policy iteration using the given dataset."""
    π = π_init or (lambda s, h: 0)  # assumption: start from an arbitrary constant policy
    for _ in range(epochs):
        Q_hat = fitted_evaluation(trajectories, fit, π, evaluation_epochs)
        π = q_to_greedy(Q_hat)
    return π

5.4 Summary

return Q_hat","key":"XOFbIxMDS4"},{"type":"output","id":"UFtYchg40arE6kZptvKGT","data":[],"key":"bhkW5Q75wY"}],"data":{},"key":"uxfPrjPOQv"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"We can also use this fixed-point interation to ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"of395DsX5T"},{"type":"emphasis","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"EutZ9YDPRt"}],"key":"zB8r7wmY3y"},{"type":"text","value":" a policy using the dataset (not necessarily the one used to generate the trajectories):","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"lBIhYxk5oZ"}],"key":"v41ES11nGO"},{"type":"proof","kind":"definition","label":"fitted_evaluation","identifier":"fitted_evaluation","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fitted policy evaluation","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"key":"M9EZBxoRNh"}],"key":"u9n5cs2aMQ"},{"type":"paragraph","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"children":[{"type":"strong","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"Xy68xAlcHp"}],"key":"OQy9es8x2q"},{"type":"text","value":" Policy ","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"cExUtCgMUR"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"html":"π:S×[H]Δ(A)\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})π:S×[H]Δ(A)","key":"vH3I91Z26U"},{"type":"text","value":" to be evaluated.","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"pc8jDhiKZe"}],"key":"ATgl0udg8Z"},{"type":"paragraph","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"strong","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"text","value":"Output:","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"xrR3jj5tVs"}],"key":"GXnJao3WqM"},{"type":"text","value":" An approximation of the value function ","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"vUhUNB7rRj"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"html":"QπQ^\\piQπ","key":"xJVIZWEk4A"},{"type":"text","value":" of the policy.","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"MZUPwRYXw2"}],"key":"FnwIo1x0GL"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":317,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"children":[{"type":"text","value":"Initialize some function ","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"key":"VlJRmGMdP1"},{"type":"inlineMath","value":"\\hat f(s, a, h) \\in 
\\mathbb{R}","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"html":"f^(s,a,h)R\\hat f(s, a, h) \\in \\mathbb{R}f^(s,a,h)R","key":"t3HtaPyBg2"},{"type":"text","value":".","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"key":"Py87EYzIAY"}],"key":"nTR3gSZdiA"},{"type":"listItem","spread":true,"position":{"start":{"line":318,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"text","value":"Iterate the following:","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"dJHWyLwiRA"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":319,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"text","value":"Generate a supervised learning dataset ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"POFpBcevCn"},{"type":"inlineMath","value":"X, y","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"html":"X,yX, yX,y","key":"v6YnCmVOHC"},{"type":"text","value":" from the trajectories and the current estimate ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"q0txSTuYFz"},{"type":"inlineMath","value":"f","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"html":"fff","key":"lcR19ZiOne"},{"type":"text","value":", where the labels come from the r.h.s. of the ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"VqQ7Su69b4"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Bellman consistency equation","key":"ii7W35VVQA"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"Xfcjnhnu3x"},{"type":"text","value":" for the given policy.","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"qhecB1WlMm"}],"key":"OZyByY7GvO"}],"key":"DLxhB9qith"},{"type":"listItem","spread":true,"position":{"start":{"line":320,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"JNCkkgJG7Z"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"html":"f^\\hat ff^","key":"YdSjYZFdf3"},{"type":"text","value":" to the function that minimizes the empirical risk:","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"b5H3BAZI7w"}],"key":"wsxwLLC4qt"},{"type":"math","value":"\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"html":"f^argminf1Ni=1N(yif(xi))2.\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - 
f(x_i))^2.f^argfminN1i=1N(yif(xi))2.","enumerator":"5.10","key":"xSnX2ebgA3"}],"key":"lW6TwHKLuP"}],"key":"daYNKuUX1e"}],"key":"KgeQldW5oZ"}],"key":"SINN3TYWZU"}],"enumerator":"5.3","html_id":"fitted-evaluation","key":"OhlvTcedn7"}],"key":"DfMieLjmJl"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_evaluation(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n π: Policy,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted policy evaluation using the given dataset.\n Returns an estimate of the Q-function of the given policy.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in tqdm(range(epochs)):\n y = get_y(trajectories, Q_hat, π)\n Q_hat = fit(X, y)\n return Q_hat","key":"Rfr7ajed4I"},{"type":"output","id":"AL54jrZ_X4sDkYA5dkU7t","data":[],"key":"A6R92s6xtP"}],"data":{},"key":"b2PtqZVnk4"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"HQSnYvy2Tn"}],"key":"YMUvbG954N"},{"type":"paragraph","position":{"start":{"line":346,"column":1},"end":{"line":347,"column":1}},"children":[{"type":"text","value":"Spot the difference between ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"fS65e16WiP"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"hjobjCv3Jm"},{"type":"text","value":" and ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"lVpFd2lNnS"},{"type":"inlineCode","value":"fitted_q_iteration","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"XcdS1SbW32"},{"type":"text","value":". (See the definition of ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"GjLRt06ISV"},{"type":"inlineCode","value":"get_y","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"qLDpn273NB"},{"type":"text","value":".)\nHow would you modify this algorithm to evaluate the data collection policy?","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"lRJhqyVxlI"}],"key":"xuvksTUjwp"}],"key":"UsnztdmULh"},{"type":"paragraph","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"text","value":"We can use this policy evaluation algorithm to adapt the [](#policy iteration algorithm ) to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. 
compute its value function) using the iterative ","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"yjtNajrE6F"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"GkKnp2aeFe"},{"type":"text","value":" algorithm.","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"vAUNGjaOmi"}],"key":"JTPiQFNNyU"}],"key":"xHD2jv10mu"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_policy_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n evaluation_epochs: int,\n π_init: Optional[Policy] = lambda s, h: 0, # constant zero policy\n):\n \"\"\"Run fitted policy iteration using the given dataset.\"\"\"\n π = π_init\n for _ in range(epochs):\n Q_hat = fitted_evaluation(trajectories, fit, π, evaluation_epochs)\n π = q_to_greedy(Q_hat)\n return π","key":"ddVSSqhPyr"},{"type":"output","id":"advXJxitqmOD-HrsYlb3e","data":[],"key":"TehCZ3WD8H"}],"data":{},"key":"VdiRmHhYEW"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":368,"column":1},"end":{"line":368,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":368,"column":1},"end":{"line":368,"column":1}},"key":"jmsn9hRBQv"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"5.4","key":"dpWUOF3jHo"}],"key":"mbvMkNIsJX"}],"key":"mLZTaw5eGT"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"4 Supervised learning","url":"/supervised-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"6 Policy Optimization","url":"/pg","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"383dbef4a54c4fa6d21d8262b47a43806b7de9e8cf0aded0d6e80d9e6efb981f","slug":"fitted-dp","location":"/fitted_dp.md","dependencies":[],"frontmatter":{"title":"5 Fitted Dynamic Programming Algorithms","numbering":{"all":{"enabled":true},"enumerator":{"template":"5.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 
2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"fitted_dp.md","url":"/build/fitted_dp-bbfcf7e66c9311fe5ec9f9beb0cc0cbc.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"zS6OQ5PWTo"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"5.1","key":"yxmUeqWUjf"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"We borrow these definitions from the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"lhdaTGdEH8"},{"type":"link","url":"/mdps","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"tQXIWj8p8Z"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"vSjvrOEwSP"},{"type":"text","value":" chapter:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Y48vE6AHAe"}],"key":"SshPXwPWv7"}],"key":"g3CNyQJdcn"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from typing import NamedTuple, Callable, Optional\nfrom jaxtyping import Float, Array\nimport jax.numpy as np\nfrom jax import grad, vmap\nimport jax.random as rand\nfrom tqdm import tqdm\nimport gymnasium as gym\n\nkey = rand.PRNGKey(184)\n\n\nclass Transition(NamedTuple):\n s: int\n a: int\n r: float\n\n\nTrajectory = list[Transition]\n\n\ndef get_num_actions(trajectories: list[Trajectory]) -> int:\n \"\"\"Get the number of actions in the dataset. Assumes actions range from 0 to A-1.\"\"\"\n return max(max(t.a for t in τ) for τ in trajectories) + 1\n\n\nState = Float[Array, \"...\"] # arbitrary shape\n\n# assume finite `A` actions and f outputs an array of Q-values\n# i.e. 
Q(s, a, h) is implemented as f(s, h)[a]\nQFunction = Callable[[State, int], Float[Array, \" A\"]]\n\n\ndef Q_zero(A: int) -> QFunction:\n \"\"\"A Q-function that always returns zero.\"\"\"\n return lambda s, a: np.zeros(A)\n\n\n# a deterministic time-dependent policy\nPolicy = Callable[[State, int], int]\n\n\ndef q_to_greedy(Q: QFunction) -> Policy:\n \"\"\"Get the greedy policy for the given state-action value function.\"\"\"\n return lambda s, h: np.argmax(Q(s, h))","visibility":"hide","key":"GjvOVpl8dg"},{"type":"output","id":"8pUJXzCUF9ZcKRj1XtFv2","data":[],"visibility":"show","key":"DjH4Uqm2bQ"}],"data":{"tags":[]},"visibility":"show","key":"pbbVyVj8xd"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":71,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"YNGL1fjB8t"},{"type":"link","url":"/mdps","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"jQ1TSWaLci"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"uY3GKmznqW"},{"type":"text","value":" chapter discussed the case of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"qY41gWrYyX"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"mwtCHZ3379"}],"key":"Rx8lbuzkP4"},{"type":"text","value":" MDPs, where the state and action spaces ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"SHh6RFQSYy"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"html":"S\\mathcal{S}S","key":"ixuDvTIN3d"},{"type":"text","value":" and ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"MmkBcQLnGP"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"html":"A\\mathcal{A}A","key":"GYfwjCsaml"},{"type":"text","value":" were finite.\nThis gave us a closed-form expression for computing the r.h.s. 
of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"fYMTUwQd5z"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"the Bellman one-step consistency equation","key":"FUHolABSeT"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"P5965vlkx4"},{"type":"text","value":".\nIn this chapter, we consider the case of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"KKe4ZguNcZ"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"large","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"yYTUoesYKO"}],"key":"yvHDr9bH1w"},{"type":"text","value":" or ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"e0Tx1Dg3Me"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"jzSQI2w0ww"}],"key":"T5ftabGRfr"},{"type":"text","value":" state spaces, where the state space is too large to be enumerated.\nIn this case, we need to ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"FMZOPeLOt1"},{"type":"emphasis","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"SuB4qUe653"}],"key":"yJqBJjfgOI"},{"type":"text","value":" the value function and Q-function using methods from ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"aHlp3XnJx8"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"LdMIykysfy"}],"key":"ocTykntGMu"},{"type":"text","value":".","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"IHpEouyhrY"}],"key":"kNDF5gwkmH"},{"type":"paragraph","position":{"start":{"line":76,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"text","value":"We will first take a quick detour to introduce the ","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"lGtc7jEGrI"},{"type":"emphasis","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"empirical risk minimization","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"FgQk8hbmm3"}],"key":"dO0BLT0HY0"},{"type":"text","value":" framework for function approximation.\nWe will then see its application to ","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"L9n4AUEZZn"},{"type":"emphasis","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"fitted","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"tru3eTC2cK"}],"key":"lPySSChZ9i"},{"type":"text","value":" RL algorithms,\nwhich attempt to learn the optimal value function (and the optimal policy) from a dataset of 
trajectories.","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"irE08Ho0DB"}],"key":"KtLGc3YPXt"},{"type":"heading","depth":2,"position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Empirical risk minimization","position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"key":"J4pJL4LPPG"}],"label":"erm","identifier":"erm","html_id":"erm","enumerator":"5.2","key":"daYxPZ4nzE"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"UePExnuC8i"},{"type":"strong","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"dic9phjyjf"}],"key":"jlCCH3iI25"},{"type":"text","value":" task is as follows:\nWe seek to learn the relationship between some input variables ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"UBLvSwRqoP"},{"type":"inlineMath","value":"x","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"xxx","key":"tNNbXkZxbq"},{"type":"text","value":" and some output variable ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"qgzYJVlMqW"},{"type":"inlineMath","value":"y","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"yyy","key":"GWKAaLfRkP"},{"type":"text","value":"\n(drawn from their joint distribution).\nPrecisely, we want to find a function ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"Gs3JjTaX0U"},{"type":"inlineMath","value":"\\hat f : x \\mapsto y","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"f^:xy\\hat f : x \\mapsto yf^:xy","key":"m1PXOXHfMq"},{"type":"text","value":" that minimizes the\n","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"lTVfBsTOe8"},{"type":"emphasis","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"squared error","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"HohtYpgnLd"}],"key":"zNKkVww1Ui"},{"type":"text","value":" of the prediction:","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"j1CmH3DjMo"}],"key":"RuCHW3YOts"},{"type":"math","value":"\\hat f = \\arg\\min_{f} \\E[(y - f(x))^2]","position":{"start":{"line":89,"column":1},"end":{"line":91,"column":1}},"html":"f^=argminfE[(yf(x))2]\\hat f = \\arg\\min_{f} \\E[(y - f(x))^2]f^=argfminE[(yf(x))2]","enumerator":"5.1","key":"DCrVEZxLbx"},{"type":"paragraph","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"text","value":"An equivalent framing is that we seek to approximate the ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"YikV8mwDGP"},{"type":"emphasis","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"text","value":"conditional expectation","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"ttz09LMm6V"}],"key":"TvmRpv4EEw"},{"type":"text","value":" of 
","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"upvwC67cul"},{"type":"inlineMath","value":"y","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"yyy","key":"UEVjAdftck"},{"type":"text","value":" given ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"GoqXQju7Wd"},{"type":"inlineMath","value":"x","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"xxx","key":"v8VyCzK6Y0"},{"type":"text","value":":","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"gT8zoDLiWo"}],"key":"veRPme754J"},{"type":"proof","kind":"theorem","label":"conditional_expectation_minimizes_mse","identifier":"conditional_expectation_minimizes_mse","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Conditional expectation minimizes mean squared error","position":{"start":{"line":95,"column":1},"end":{"line":95,"column":1}},"key":"VoFIvseYSS"}],"key":"l88teB114J"},{"type":"math","value":"\\arg\\min_{f} \\E[(y - f(x))^2] = (x \\mapsto \\E[y \\mid x])","position":{"start":{"line":98,"column":1},"end":{"line":100,"column":1}},"html":"argminfE[(yf(x))2]=(xE[yx])\\arg\\min_{f} \\E[(y - f(x))^2] = (x \\mapsto \\E[y \\mid x])argfminE[(yf(x))2]=(xE[yx])","enumerator":"5.2","key":"ngPzQJpHW4"}],"enumerator":"5.1","html_id":"conditional-expectation-minimizes-mse","key":"XmtAV9JywA"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"paragraph","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"We can decompose the mean squared error as","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"PiSvNtEXPi"}],"key":"aJtBS42fE7"},{"type":"math","value":"\\begin{aligned}\n\\E[(y - f(x))^2] &= \\E[ (y - \\E[y \\mid x] + \\E[y \\mid x] - f(x))^2 ] \\\\\n&= \\E[ (y - \\E[y \\mid x])^2 ] + \\E[ (\\E[y \\mid x] - f(x))^2 ] + 2 \\E[ (y - \\E[y \\mid x])(\\E[y \\mid x] - f(x)) ] \\\\\n\\end{aligned}","position":{"start":{"line":106,"column":1},"end":{"line":111,"column":1}},"html":"E[(yf(x))2]=E[(yE[yx]+E[yx]f(x))2]=E[(yE[yx])2]+E[(E[yx]f(x))2]+2E[(yE[yx])(E[yx]f(x))]\\begin{aligned}\n\\E[(y - f(x))^2] &= \\E[ (y - \\E[y \\mid x] + \\E[y \\mid x] - f(x))^2 ] \\\\\n&= \\E[ (y - \\E[y \\mid x])^2 ] + \\E[ (\\E[y \\mid x] - f(x))^2 ] + 2 \\E[ (y - \\E[y \\mid x])(\\E[y \\mid x] - f(x)) ] \\\\\n\\end{aligned}E[(yf(x))2]=E[(yE[yx]+E[yx]f(x))2]=E[(yE[yx])2]+E[(E[yx]f(x))2]+2E[(yE[yx])(E[yx]f(x))]","enumerator":"5.3","key":"ZYeOCugr19"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"ISp3ohiugU"}],"key":"AObrviuE38"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"Use the law of iterated expectations to show that the last term is zero.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"nHVVB9OA2S"}],"key":"ro9LiKi9Tc"}],"key":"P4k2N5m1Sy"},{"type":"paragraph","position":{"start":{"line":117,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"The first term is the irreducible error, and the second term is the error due to the approximation,\nwhich is minimized at 
","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"LYMfXMFPuV"},{"type":"text","value":"0","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"o0pvx5vkiV"},{"type":"text","value":" when ","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"GbedGHYrtj"},{"type":"inlineMath","value":"f(x) = \\E[y \\mid x]","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"html":"f(x)=E[yx]f(x) = \\E[y \\mid x]f(x)=E[yx]","key":"tjspicOxKK"},{"type":"text","value":".","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"cthIAVYcoA"}],"key":"PLPsqAeRp4"}],"enumerator":"5.1","key":"LzmFsAokXx"},{"type":"paragraph","position":{"start":{"line":121,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"text","value":"In most applications, the joint distribution of ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"lTs5AOPX7s"},{"type":"inlineMath","value":"x, y","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"x,yx, yx,y","key":"bKNVqtWmvy"},{"type":"text","value":" is unknown or extremely complex, and so we can’t\nanalytically evaluate ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Tx7LqLomNj"},{"type":"inlineMath","value":"\\E [y \\mid x]","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"E[yx]\\E [y \\mid x]E[yx]","key":"QzHY1F73SB"},{"type":"text","value":".\nInstead, our strategy is to draw ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"lswburjOdQ"},{"type":"inlineMath","value":"N","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"NNN","key":"BNps5ZlAYi"},{"type":"text","value":" samples ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"qnNl21LFEC"},{"type":"inlineMath","value":"(x_i, y_i)","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"(xi,yi)(x_i, y_i)(xi,yi)","key":"IY7QgQIjer"},{"type":"text","value":" from the joint distribution of ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Vk6LEMHKGA"},{"type":"inlineMath","value":"x","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"xxx","key":"EsklIgjZNO"},{"type":"text","value":" and ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"iNYXmMvyeq"},{"type":"inlineMath","value":"y","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"yyy","key":"HjuMxHmNWw"},{"type":"text","value":",\nand then use the ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"GHvkpJBK8V"},{"type":"emphasis","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"sample average","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"XRhdHlNm4U"}],"key":"ZOI4XrExD5"},{"type":"text","value":" ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"eSxzjVVPw8"},{"type":"inlineMath","value":"\\sum_{i=1}^N (y_i - f(x_i))^2 / N","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"i=1N(yif(xi))2/N\\sum_{i=1}^N (y_i - f(x_i))^2 / Ni=1N(yif(xi))2/N","key":"jWY4Madoh7"},{"type":"text","value":" to approximate the mean squared error.\nThen we use a 
","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"CFF11HEAUP"},{"type":"emphasis","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"D7dwod2q9o"}],"key":"MsG1HFbXw6"},{"type":"text","value":" to find a function ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"L3Y2WhXtie"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"f^\\hat ff^","key":"OT9gUl5Hu6"},{"type":"text","value":" that minimizes this objective\nand thus approximates the conditional expectation.\nThis approach is called ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"KktbbE9cNC"},{"type":"strong","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"empirical risk minimization","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"zNCAOb71gu"}],"key":"D6f771VvIN"},{"type":"text","value":".","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Dwg2XEKbRs"}],"key":"sCfCY0PSqX"},{"type":"proof","kind":"definition","label":"empirical_risk_minimization","identifier":"empirical_risk_minimization","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Empirical risk minimization","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"QP7QLe41CY"}],"key":"QqWXWyEX6v"},{"type":"paragraph","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"Given a dataset of samples ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"Ris3K8BoCm"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"Myax5yozfy"},{"type":"text","value":", empirical risk minimization seeks to find a function ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"udPhZa4FHh"},{"type":"inlineMath","value":"f","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"fff","key":"DNRVidgO35"},{"type":"text","value":" (from some class of functions ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"yxegKu6D0p"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"F\\mathcal{F}F","key":"sAlZdjkQqr"},{"type":"text","value":") that minimizes the empirical risk:","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"FoBafnPZTM"}],"key":"WAnQxDkwD6"},{"type":"math","value":"\\hat f = \\arg\\min_{f \\in \\mathcal{F}} \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2","position":{"start":{"line":134,"column":1},"end":{"line":136,"column":1}},"html":"f^=argminfF1Ni=1N(yif(xi))2\\hat f = \\arg\\min_{f \\in \\mathcal{F}} \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2f^=argfFminN1i=1N(yif(xi))2","enumerator":"5.4","key":"G2tWVQVy7a"},{"type":"paragraph","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"We will cover the details of the minimization process in [](#the next section 
).","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"RkNFKFzt7J"}],"key":"N5g2QskjOX"}],"enumerator":"5.1","html_id":"empirical-risk-minimization","key":"IEw4BO9NRG"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"c9JHUdi4LO"}],"key":"XrzNtL0tIo"},{"type":"paragraph","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"children":[{"type":"text","value":"Why is it important that we constrain our search to a class of functions ","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"uOxcoy4cdG"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"html":"F\\mathcal{F}F","key":"oTESj6tfWk"},{"type":"text","value":"?","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"JqFT61tEMo"}],"key":"qSn9VEv1y5"},{"type":"paragraph","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Hint: Consider the function ","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"wg3ovtdw1N"},{"type":"inlineMath","value":"f(x) = \\sum_{i=1}^N y_i \\mathbb{1}_{\\{ x = x_i \\}}","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"html":"f(x)=i=1Nyi1{x=xi}f(x) = \\sum_{i=1}^N y_i \\mathbb{1}_{\\{ x = x_i \\}}f(x)=i=1Nyi1{x=xi}","key":"QxDNMqvFb1"},{"type":"text","value":". What is the empirical risk of this function? Would you consider it a good approximation of the conditional expectation?","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"BxfHHGxdum"}],"key":"svxsepspFT"}],"key":"Ol6ogFaF7m"},{"type":"heading","depth":2,"position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"Fitted value iteration","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"OMiDYJgcHg"}],"identifier":"fitted-value-iteration","label":"Fitted value iteration","html_id":"fitted-value-iteration","implicit":true,"enumerator":"5.3","key":"DSdq4wTYlY"},{"type":"paragraph","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"Let us apply ERM to the RL problem of computing the optimal policy / value function.","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"OMSc6EC4F7"}],"key":"U7J2i7qSbO"},{"type":"paragraph","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"How did we compute the optimal value function in MDPs with ","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"hLN1EgA6Sf"},{"type":"emphasis","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"PoAx3gauBO"}],"key":"qYfUgvSmg9"},{"type":"text","value":" state and action 
spaces?","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"UqFxJznhjb"}],"key":"qljQk3gJem"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":153,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":153,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"In a [](#finite-horizon MDP ), we can use ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"ezwGmJEAQ9"},{"type":"crossReference","kind":"proof:definition","identifier":"pi_star_dp","label":"pi_star_dp","children":[{"type":"text","value":"dynamic programming","key":"yWZluXBV7B"}],"template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"hS4ZwIEeQb"},{"type":"text","value":", working backwards from the end of the time horizon, to compute the optimal value function exactly.","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"w9Na2NUC6q"}],"key":"NEpyma5nUr"}],"key":"YWSVRL12xg"},{"type":"listItem","spread":true,"position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"In an [](#infinite-horizon MDP ), we can use [](#value iteration ), which iterates the Bellman optimality operator ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"fuAly8PGfR"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality_operator","label":"bellman_optimality_operator","children":[{"type":"text","value":"(","key":"D97cXjgzvO"},{"type":"text","value":"1.54","key":"x9gRRZa2Vv"},{"type":"text","value":")","key":"WN7YliAxm0"}],"template":"(%s)","enumerator":"1.54","resolved":true,"html_id":"bellman-optimality-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"wqYMDx00P0"},{"type":"text","value":" to approximately compute the optimal value function.","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"J9fVgvq8uO"}],"key":"cmXzXcuLkx"}],"key":"vsdhevnuDL"}],"key":"T5Vb2ozkbr"},{"type":"paragraph","position":{"start":{"line":157,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"Our existing approaches represent the value function, and the MDP itself,\nin matrix notation.\nBut what happens if the state space is extremely large, or even infinite (e.g. 
real-valued)?\nThen computing a weighted sum over all possible next states, which is required to compute the Bellman operator,\nbecomes intractable.","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"LaeZGNJPcB"}],"key":"YWyCV5Tpfx"},{"type":"paragraph","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"Instead, we will need to use ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"Ff72srpb81"},{"type":"emphasis","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"function approximation","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"syoIJTVkMC"}],"key":"GUbIT7gnTN"},{"type":"text","value":" methods from supervised learning to solve for the value function in an alternative way.","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"qNryfGV8I5"}],"key":"YZVH09GXVW"},{"type":"paragraph","position":{"start":{"line":165,"column":1},"end":{"line":166,"column":1}},"children":[{"type":"text","value":"In particular, suppose we have a dataset of ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"XSXwgnweRu"},{"type":"inlineMath","value":"N","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"html":"NNN","key":"jXGiqlSR6b"},{"type":"text","value":" trajectories ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"kSnysfw6Sm"},{"type":"inlineMath","value":"\\tau_1, \\dots, \\tau_N \\sim \\rho_{\\pi}","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"html":"τ1,,τNρπ\\tau_1, \\dots, \\tau_N \\sim \\rho_{\\pi}τ1,,τNρπ","key":"rsUUJrMq0y"},{"type":"text","value":" from some policy ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"Tv2YctNzXB"},{"type":"text","value":"π","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"HMwg8yWmym"},{"type":"text","value":" (called the ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"C4fbr6vukh"},{"type":"strong","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"text","value":"data collection policy","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"Wx2Tx9DhJN"}],"key":"BMsPWda8FK"},{"type":"text","value":") acting in the MDP of interest.\nLet us indicate the trajectory index in the superscript, so that","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"gdQPClsW6v"}],"key":"zTDLPiFyX5"},{"type":"math","value":"\\tau_i = \\{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \\dots, s_{\\hor-1}^i, a_{\\hor-1}^i, r_{\\hor-1}^i \\}.","position":{"start":{"line":168,"column":1},"end":{"line":170,"column":1}},"html":"τi={s0i,a0i,r0i,s1i,a1i,r1i,,sH1i,aH1i,rH1i}.\\tau_i = \\{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \\dots, s_{\\hor-1}^i, a_{\\hor-1}^i, r_{\\hor-1}^i \\}.τi={s0i,a0i,r0i,s1i,a1i,r1i,,sH1i,aH1i,rH1i}.","enumerator":"5.5","key":"moJnQn8Heo"}],"key":"hAo1JVLTvh"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def collect_data(\n env: gym.Env, N: int, H: int, key: rand.PRNGKey, π: Optional[Policy] = None\n) -> list[Trajectory]:\n \"\"\"Collect a dataset of trajectories from the given policy (or a random one).\"\"\"\n trajectories = []\n seeds = 
[rand.bits(k).item() for k in rand.split(key, N)]\n for i in tqdm(range(N)):\n τ = []\n s, _ = env.reset(seed=seeds[i])\n for h in range(H):\n # sample from a random policy\n a = π(s, h) if π else env.action_space.sample()\n s_next, r, terminated, truncated, _ = env.step(a)\n τ.append(Transition(s, a, r))\n if terminated or truncated:\n break\n s = s_next\n trajectories.append(τ)\n return trajectories","key":"VnXdO7psKF"},{"type":"output","id":"UisSZXHmY_Iaacpwxvgoh","data":[],"key":"B9KgvEuUB2"}],"data":{},"key":"XckbXksuig"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"env = gym.make(\"LunarLander-v2\")\ntrajectories = collect_data(env, 100, 300, key)\ntrajectories[0][:5] # show first five transitions from first trajectory","key":"Fo9Ca27WfF"},{"type":"output","id":"k8YtlaYDO0W5vIl6NaGub","data":[{"output_type":"stream","name":"stderr","text":"\r 0%| | 0/100 [00:00Qh(s,a)=r(s,a)+EsP(s,a)[maxaQh+1(s,a)]Q_\\hi^\\star(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [\\max_{a'} Q_{\\hi+1}^\\star(s', a')]Qh(s,a)=r(s,a)+EsP(s,a)[amaxQh+1(s,a)]","enumerator":"5.6","key":"Q48u4rORy3"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"We can think of the arguments to the Q-function -- i.e. the current state, action, and timestep ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"nUAiRpNLoy"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"h\\hih","key":"T8gaSaBZDf"},{"type":"text","value":" --\nas the inputs ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"qSrrqTCcqU"},{"type":"inlineMath","value":"x","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"xxx","key":"xTFoIb80Ot"},{"type":"text","value":", and the r.h.s. of the above equation as the label ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"NlY6UR25T1"},{"type":"inlineMath","value":"f(x)","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"f(x)f(x)f(x)","key":"nh9KDOzArj"},{"type":"text","value":". Note that the r.h.s. 
can also be expressed as a ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"vjJwb7AM9Y"},{"type":"strong","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"children":[{"type":"text","value":"conditional expectation","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"khQYtvKMJ3"}],"key":"IA3Voo1jw9"},{"type":"text","value":":","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"EIzZGJ6pJd"}],"key":"rfdK17jep1"},{"type":"math","value":"f(x) = \\E [y \\mid x] \\quad \\text{where} \\quad y = r(s_\\hi, a_\\hi) + \\max_{a'} Q^\\star_{\\hi + 1}(s', a').","position":{"start":{"line":211,"column":1},"end":{"line":213,"column":1}},"html":"f(x)=E[yx]wherey=r(sh,ah)+maxaQh+1(s,a).f(x) = \\E [y \\mid x] \\quad \\text{where} \\quad y = r(s_\\hi, a_\\hi) + \\max_{a'} Q^\\star_{\\hi + 1}(s', a').f(x)=E[yx]wherey=r(sh,ah)+amaxQh+1(s,a).","enumerator":"5.7","key":"MxjtTzmIff"},{"type":"paragraph","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Approximating the conditional expectation is precisely the task that ","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"VHSEGAQ2nv"},{"type":"crossReference","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Section ","key":"LKqjUWS9dt"},{"type":"text","value":"5.2","key":"PuzsTuM9Sv"}],"identifier":"erm","label":"erm","kind":"heading","template":"Section %s","enumerator":"5.2","resolved":true,"html_id":"erm","key":"JYefFqnsn0"},{"type":"text","value":" is suited for!","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"l3V7Ovd7Du"}],"key":"k6Fn3sMn59"},{"type":"paragraph","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"children":[{"type":"text","value":"Our above dataset would give us ","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"kQW2eQCEQh"},{"type":"inlineMath","value":"N \\cdot \\hor","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"html":"NHN \\cdot \\horNH","key":"wGq7SNMnSN"},{"type":"text","value":" samples in the dataset:","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"nGzbKsWAdM"}],"key":"hDUo10d5V0"},{"type":"math","value":"x_{i \\hi} = (s_\\hi^i, a_\\hi^i, \\hi) \\qquad y_{i \\hi} = r(s_\\hi^i, a_\\hi^i) + \\max_{a'} Q^\\star_{\\hi + 1}(s_{\\hi + 1}^i, a')","position":{"start":{"line":219,"column":1},"end":{"line":221,"column":1}},"html":"xih=(shi,ahi,h)yih=r(shi,ahi)+maxaQh+1(sh+1i,a)x_{i \\hi} = (s_\\hi^i, a_\\hi^i, \\hi) \\qquad y_{i \\hi} = r(s_\\hi^i, a_\\hi^i) + \\max_{a'} Q^\\star_{\\hi + 1}(s_{\\hi + 1}^i, a')xih=(shi,ahi,h)yih=r(shi,ahi)+amaxQh+1(sh+1i,a)","enumerator":"5.8","key":"Fs1AcvHuj4"}],"key":"bQM3ydcxQ5"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def get_X(trajectories: list[Trajectory]):\n \"\"\"\n We pass the state and timestep as input to the Q-function\n and return an array of Q-values.\n \"\"\"\n rows = [(τ[h].s, τ[h].a, h) for τ in trajectories for h in range(len(τ))]\n return [np.stack(ary) for ary in zip(*rows)]\n\n\ndef get_y(\n trajectories: list[Trajectory],\n f: Optional[QFunction] = None,\n π: Optional[Policy] = None,\n):\n \"\"\"\n Transform the dataset of trajectories into a dataset for supervised learning.\n If `π` is None, 
instead estimates the optimal Q function.\n    Otherwise, estimates the Q function of π.\n    \"\"\"\n    f = f or Q_zero(get_num_actions(trajectories))\n    y = []\n    for τ in trajectories:\n        for h in range(len(τ) - 1):\n            s, a, r = τ[h]\n            # bootstrap from the next state, as in the Bellman target\n            s_next = τ[h + 1].s\n            Q_values = f(s_next, h + 1)\n            y.append(r + (Q_values[π(s_next, h + 1)] if π else Q_values.max()))\n        y.append(τ[-1].r)\n    return np.array(y)","key":"pBjV6iPEV5"},{"type":"output","id":"lYkVtBQEcerGUWBq-34fL","data":[],"key":"cJGe6pp4lK"}],"data":{},"key":"EPgVeJRIt6"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"s, a, h = get_X(trajectories[:1])\nprint(\"states:\", s[:5])\nprint(\"actions:\", a[:5])\nprint(\"timesteps:\", h[:5])","key":"QBQ2QxPmL7"},{"type":"output","id":"dDa0W4zHLWpUlEjucWk1A","data":[{"output_type":"stream","name":"stdout","text":"states: [[-0.00767412 1.4020356 -0.77731264 -0.39489663 0.00889908 0.17607279\n 0. 0. ]\n [-0.01526899 1.392572 -0.766254 -0.42065707 0.01559265 0.13388489\n 0. 0. ]\n [-0.02286405 1.3825084 -0.7662748 -0.44735536 0.02228237 0.13380653\n 0. 0. ]\n [-0.0304594 1.3718452 -0.7662946 -0.4740309 0.02897082 0.13378178\n 0. 0. ]\n [-0.03802614 1.361714 -0.7636849 -0.45042533 0.03589968 0.1385901\n 0. 0. ]]\nactions: [3 0 0 2 2]\ntimesteps: [0 1 2 3 4]\n"}],"key":"qsMc4JRcHj"}],"data":{},"key":"iLY7TMZfpj"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"get_y(trajectories[:1])[:5]","key":"XOFyf8OdvZ"},{"type":"output","id":"01EPeySRKZVc8Ir-eoDKQ","data":[{"output_type":"execute_result","execution_count":6,"metadata":{},"data":{"text/plain":{"content":"Array([ 0.01510799, -0.9906127 , -0.9934895 , 1.4450092 , 0.43907362], dtype=float32)","content_type":"text/plain"}}}],"key":"gg0ibtQ6Vo"}],"data":{},"key":"BF90lrezTS"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"Then we can use empirical risk minimization to find a function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"ACAokekxbX"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"f^\\hat ff^","key":"oxYwdzqw1X"},{"type":"text","value":" that approximates the optimal Q-function.","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"lvhZju6wEK"}],"key":"s7fuV96oVd"}],"key":"blJFEsSX1c"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# We will see some examples of fitting methods in the next section\nFittingMethod = Callable[[Float[Array, \"N D\"], Float[Array, \" N\"]], QFunction]","key":"h1LQR20tfD"},{"type":"output","id":"Ljiokj6taEDHmUOKW7Whn","data":[],"key":"yNmUvQLZIP"}],"data":{},"key":"jqE2WsZNFG"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":272,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"But notice that the definition of ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"nUaSfFkc4z"},{"type":"inlineMath","value":"y_{i \\hi}","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"yihy_{i \\hi}yih","key":"e6V3DTfR7E"},{"type":"text","value":" depends on the Q-function itself!\nHow can we resolve this circular dependency?\nRecall that we faced the same issue 
","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"HKaBB9p8pr"},{"type":"crossReference","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"when evaluating a policy in an infinite-horizon MDP","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"pfyituWKlt"}],"identifier":"iterative_pe","label":"iterative_pe","kind":"heading","template":"Section %s","enumerator":"1.5.2.2","resolved":true,"html_id":"iterative-pe","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"gHQbSGaZUj"},{"type":"text","value":". There, we iterated the ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"iVTcLNmKBF"},{"type":"crossReference","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"Definition ","key":"nUSexCMHB9"},{"type":"text","value":"1.8","key":"AAFXdefdTQ"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"RNy4GG8lxh"},{"type":"text","value":" since we knew that the policy’s value function was a fixed point of the policy’s Bellman operator.\nWe can apply the same strategy here, using the ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"g4sKt18UiW"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"f^\\hat ff^","key":"BXWXvYmw1q"},{"type":"text","value":" from the previous iteration to compute the labels ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"NIhElsxbe6"},{"type":"inlineMath","value":"y_{i \\hi}","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"yihy_{i \\hi}yih","key":"XwuiiQjnlY"},{"type":"text","value":",\nand then using this new dataset to fit the next iterate.","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"IcWX8N7cHk"}],"key":"nJXLw9mDJV"},{"type":"proof","kind":"definition","label":"fitted_q_iteration","identifier":"fitted_q_iteration","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fitted Q-function iteration","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"key":"QkiIGQX1ly"}],"key":"n8PCqkLzN9"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":281,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"children":[{"type":"text","value":"Initialize some function ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"qju2OYTC9i"},{"type":"inlineMath","value":"\\hat f(s, a, h) \\in \\mathbb{R}","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"f^(s,a,h)R\\hat f(s, a, h) \\in \\mathbb{R}f^(s,a,h)R","key":"n864YOwkEl"},{"type":"text","value":".","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"GbvBdQQ3F5"}],"key":"xwEZoDB4Re"},{"type":"listItem","spread":true,"position":{"start":{"line":282,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"text","value":"Iterate the 
following:","position":{"start":{"line":282,"column":1},"end":{"line":282,"column":1}},"key":"SlsFYKMavk"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":283,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"Generate a supervised learning dataset ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"pPFR6AqBJe"},{"type":"inlineMath","value":"X, y","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"html":"X,yX, yX,y","key":"y157PLAwR6"},{"type":"text","value":" from the trajectories and the current estimate ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"UDloivl6vv"},{"type":"inlineMath","value":"f","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"html":"fff","key":"k4XDxFG0gB"},{"type":"text","value":", where the labels come from the r.h.s. of the Bellman optimality operator ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"i76doX051C"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality_operator","label":"bellman_optimality_operator","children":[{"type":"text","value":"(","key":"cPXMEvzw9u"},{"type":"text","value":"1.54","key":"Mxvys2sD88"},{"type":"text","value":")","key":"m1dN1fdx4P"}],"template":"(%s)","enumerator":"1.54","resolved":true,"html_id":"bellman-optimality-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"x9RFFkUBzc"}],"key":"b2mGk3Q1xN"}],"key":"pf98YPhgKk"},{"type":"listItem","spread":true,"position":{"start":{"line":284,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"HI4Oqn8UAe"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"f^\\hat ff^","key":"xwmUo4f9OB"},{"type":"text","value":" to the function that minimizes the empirical risk:","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"cXlNw33bQX"}],"key":"AOeveyjDz0"},{"type":"math","value":"\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.","position":{"start":{"line":286,"column":1},"end":{"line":286,"column":1}},"html":"f^argminf1Ni=1N(yif(xi))2.\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.f^argfminN1i=1N(yif(xi))2.","enumerator":"5.9","key":"yjCnmYxqlY"}],"key":"ICUqqe9PZ8"}],"key":"p9LFLk2wMG"}],"key":"WEKim84DvH"}],"key":"ZRt77LG3Gl"}],"enumerator":"5.2","html_id":"fitted-q-iteration","key":"fI5QPUOycH"}],"key":"rSSDvwKMhy"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_q_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted Q-function iteration using the given dataset.\n Returns an estimate of the optimal Q-function.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in range(epochs):\n y = get_y(trajectories, Q_hat)\n Q_hat = fit(X, y)\n 
return Q_hat","key":"o1zc5qC3ZN"},{"type":"output","id":"rngIsQEeWQnWrE8fSIwLD","data":[],"key":"TFMp2AwzdB"}],"data":{},"key":"YHrz0n2YDh"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"We can also use this fixed-point iteration to ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"ktT9KYvGuM"},{"type":"emphasis","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"sHQaOQ9TAN"}],"key":"hMbWTku6AE"},{"type":"text","value":" a policy using the dataset (not necessarily the one used to generate the trajectories):","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"uwgQSxVB6t"}],"key":"vdGRkbSaT3"},{"type":"proof","kind":"definition","label":"fitted_evaluation","identifier":"fitted_evaluation","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fitted policy evaluation","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"key":"hyHN7K7ne1"}],"key":"GFj2sEY2rF"},{"type":"paragraph","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"children":[{"type":"strong","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"sUguAbYoxS"}],"key":"m1Q9RN31WU"},{"type":"text","value":" Policy ","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"jj777EKfGD"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"html":"π:S×[H]Δ(A)\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})π:S×[H]Δ(A)","key":"kJROcJ8012"},{"type":"text","value":" to be evaluated.","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"ceQnDUqnJB"}],"key":"XVMKATYUPj"},{"type":"paragraph","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"strong","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"text","value":"Output:","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"RedKPUMZOF"}],"key":"ZrGO4xZPlq"},{"type":"text","value":" An approximation of the value function ","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"ttDuWm3y93"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"html":"QπQ^\\piQπ","key":"sGhnB6b72I"},{"type":"text","value":" of the policy.","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"NrkuuNvxe1"}],"key":"L13r0KAKpC"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":317,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"children":[{"type":"text","value":"Initialize some function ","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"key":"xmk16C4cyH"},{"type":"inlineMath","value":"\\hat f(s, a, h) \\in 
\\mathbb{R}","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"html":"f^(s,a,h)R\\hat f(s, a, h) \\in \\mathbb{R}f^(s,a,h)R","key":"lsNwBMLXMX"},{"type":"text","value":".","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"key":"VawkbTb3Ga"}],"key":"SgxAgW7zn7"},{"type":"listItem","spread":true,"position":{"start":{"line":318,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"text","value":"Iterate the following:","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"NvYkS2ytBl"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":319,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"text","value":"Generate a supervised learning dataset ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"jgIlEXYXo1"},{"type":"inlineMath","value":"X, y","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"html":"X,yX, yX,y","key":"lgGDZnqtEf"},{"type":"text","value":" from the trajectories and the current estimate ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"tHVWm4nXM0"},{"type":"inlineMath","value":"f","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"html":"fff","key":"jOXWavhNJP"},{"type":"text","value":", where the labels come from the r.h.s. of the ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"DrMyGGrx5C"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Bellman consistency equation","key":"Cp9RJPvLpk"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"xUpEAteCfT"},{"type":"text","value":" for the given policy.","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"LlM8vegrBk"}],"key":"vRnNwki4in"}],"key":"fU3fKyFxaG"},{"type":"listItem","spread":true,"position":{"start":{"line":320,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"T8vXVZSW0A"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"html":"f^\\hat ff^","key":"Lqbt1RPscm"},{"type":"text","value":" to the function that minimizes the empirical risk:","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"P6IuC26wbE"}],"key":"ehoSJXl8gL"},{"type":"math","value":"\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"html":"f^argminf1Ni=1N(yif(xi))2.\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - 
f(x_i))^2.f^argfminN1i=1N(yif(xi))2.","enumerator":"5.10","key":"PSak6mBlEs"}],"key":"SIY568aH3c"}],"key":"zZJsyYzSAM"}],"key":"y6jZwToQ5m"}],"key":"CcYmYc9Wkv"}],"enumerator":"5.3","html_id":"fitted-evaluation","key":"hPBntLgH7L"}],"key":"XAbpFbgMHT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_evaluation(\n    trajectories: list[Trajectory],\n    fit: FittingMethod,\n    π: Policy,\n    epochs: int,\n    Q_init: Optional[QFunction] = None,\n) -> QFunction:\n    \"\"\"\n    Run fitted policy evaluation using the given dataset.\n    Returns an estimate of the Q-function of the given policy.\n    \"\"\"\n    Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n    X = get_X(trajectories)\n    for _ in tqdm(range(epochs)):\n        y = get_y(trajectories, Q_hat, π)\n        Q_hat = fit(X, y)\n    return Q_hat","key":"AQzD5Z9HTe"},{"type":"output","id":"EInXwQepFk5Y8NUP9xEMj","data":[],"key":"BTIx5W4TMO"}],"data":{},"key":"ao2yNlgjFY"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"FnAN6cE9DJ"}],"key":"Nn9wbVowVF"},{"type":"paragraph","position":{"start":{"line":346,"column":1},"end":{"line":347,"column":1}},"children":[{"type":"text","value":"Spot the difference between ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"A36fnja6db"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"N37XUepxiq"},{"type":"text","value":" and ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"UVAAn11otM"},{"type":"inlineCode","value":"fitted_q_iteration","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"YQWKapkleE"},{"type":"text","value":". (See the definition of ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"CW0quZNwAs"},{"type":"inlineCode","value":"get_y","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"uqyKMIBVP5"},{"type":"text","value":".)\nHow would you modify this algorithm to evaluate the data collection policy?","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"xAC0VX7K1w"}],"key":"rIdgYw7CvI"}],"key":"ZNOumUs6yX"},{"type":"paragraph","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"text","value":"We can use this policy evaluation algorithm to adapt the policy iteration algorithm to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e.
compute its value function) using the iterative ","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"LriuiQbLT7"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"w0HMYs6ICS"},{"type":"text","value":" algorithm.","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"IttOH4YFZ0"}],"key":"VU6kujl1Jo"}],"key":"qvnNIC0zHU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_policy_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n evaluation_epochs: int,\n π_init: Optional[Policy] = lambda s, h: 0, # constant zero policy\n):\n \"\"\"Run fitted policy iteration using the given dataset.\"\"\"\n π = π_init\n for _ in range(epochs):\n Q_hat = fitted_evaluation(trajectories, fit, π, evaluation_epochs)\n π = q_to_greedy(Q_hat)\n return π","key":"IquIIeYl9F"},{"type":"output","id":"mk7k8PhH1ign1fEqp3iON","data":[],"key":"QERQlrvDer"}],"data":{},"key":"j0retBrZDf"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":368,"column":1},"end":{"line":368,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":368,"column":1},"end":{"line":368,"column":1}},"key":"KaHnU7Hxxx"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"5.4","key":"tOyuOJwbtV"}],"key":"AAlfb1DQvU"}],"key":"wwNI7tyPYs"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"4 Supervised learning","url":"/supervised-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"6 Policy Gradient Methods","url":"/pg","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/imitation-learning.html b/imitation-learning.html index 20b9166..721b45d 100644 --- a/imitation-learning.html +++ b/imitation-learning.html @@ -14,10 +14,10 @@ ); root.querySelectorAll(".hide-mac").forEach(node => {node.classList.add(isMac ? "hidden" : "block")}); root.querySelectorAll(".show-mac").forEach(node => {node.classList.add(!isMac ? "hidden" : "block")}); -})()

7 Imitation Learning

7.1 Introduction

Imagine you are tasked with learning how to drive. How do, or did, you go about it? At first, this task might seem insurmountable: there is a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error. Luckily, there are already people in the world who know how to drive and who can get you started. In this and many other examples, we all “stand on the shoulders of giants” and learn skills from experts who have already mastered them.

Now in machine learning, much of the time, we are trying to teach machines to accomplish tasks that we humans are already proficient at.

@@ -31,22 +31,22 @@

How does this relate to interactive tasks? Here, the input is the observation seen by the agent and the output is the action it selects, so the mapping is the agent’s policy. What’s stopping us from applying supervised learning techniques? In practice, nothing! This is called behavioral cloning.

Typically, this second task can be framed as empirical loss minimization:

$$\tilde \pi = \arg\min_{\pi \in \Pi} \sum_{n=0}^{N-1} \text{loss}(\pi(s^n), a^n)$$

where $\Pi$ is some class of possible policies, $\text{loss}$ is the loss function to measure how far off the policy’s prediction is, and the SL algorithm tells us how to compute this $\arg\min$. If training a deterministic policy that is just a function from inputs to outputs with no randomness, we might try to minimize the mean squared error. More generally, though, we often choose the negative log likelihood as our loss function, so that the optimization is equivalent to maximum likelihood estimation: out of the space of all possible mappings, we search for the one according to which the training dataset is the most likely.

$$\tilde \pi = \arg\max_{\pi \in \Pi} \mathbb{P}_{a^n \sim \pi(s^n)}(a^{0:N} \mid s^{0:N})$$
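
To make the fit step concrete, here is a minimal sketch of behavioral cloning as maximum likelihood estimation: gradient descent on the empirical negative log likelihood of the expert’s actions under a linear softmax policy class. The policy class, the synthetic expert data, and the plain gradient-descent optimizer are all illustrative assumptions, not code from these notes.

```python
import numpy as np

def behavioral_cloning(states, actions, num_actions, lr=0.1, epochs=500):
    """Fit a linear softmax policy to expert (state, action) pairs by
    gradient descent on the empirical negative log likelihood (MLE)."""
    N, d = states.shape
    theta = np.zeros((d, num_actions))
    for _ in range(epochs):
        logits = states @ theta
        logits -= logits.max(axis=1, keepdims=True)      # numerical stability
        probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
        onehot = np.eye(num_actions)[actions]
        theta -= lr * states.T @ (probs - onehot) / N    # NLL gradient step
    return theta

# Hypothetical expert dataset: the expert's action depends on the first feature.
rng = np.random.default_rng(0)
S = rng.normal(size=(500, 4))
A = (S[:, 0] > 0).astype(int)
theta = behavioral_cloning(S, A, num_actions=2)
print("training accuracy:", ((S @ theta).argmax(axis=1) == A).mean())
```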

Can we quantify how well this algorithm works? For simplicity, let’s consider the case where the action space is discrete and both the data and trained policy are deterministic. (This corresponds to a classification task in SL.) Suppose the SL algorithm obtains $\varepsilon$ classification error. That is, for trajectories drawn from the expert policy, the learned policy chooses a different action at most $\varepsilon$ of the time:

$$\mathbb{E}_{\tau \sim \rho_{\pi_{\text{data}}}} \left[ \frac{1}{H} \sum_{h=0}^{H-1} \mathbf{1}\left\{ \tilde \pi(s_h) \ne \pi_{\text{data}}(s_h) \right\} \right] \le \varepsilon$$

Then, their value functions differ by

$$| V^{\pi_{\text{data}}} - V^{\tilde \pi} | \le H^2 \varepsilon$$

where $H$ is the horizon.
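
In sketch form (the notes derive this carefully via the performance difference lemma): the value gap is the expected sum of the learned policy’s advantages along expert trajectories; the advantage vanishes wherever the two policies agree; a disagreement occurs on at most an $\varepsilon$ fraction of the $H$ steps on average, and each disagreement costs an advantage of at most $H$ when rewards are bounded in $[0, 1]$. So

$$V_0^{\pi_{\text{data}}}(s) - V_0^{\tilde \pi}(s) = \mathbb{E}_{\tau \sim \rho^{\pi_{\text{data}}} \mid s_0 = s} \left[ \sum_{h=0}^{H-1} A_h^{\tilde \pi}(s_h, a_h) \right] \le (H \varepsilon) \cdot H = H^2 \varepsilon.$$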

7.3 Distribution shift

Let us return to the driving analogy. Suppose you have taken some driving lessons and now feel comfortable in your neighbourhood. But today you have to travel to an area you haven’t visited before, such as a highway, where it would be dangerous to try and apply the techniques you’ve already learned. This is the issue of distribution shift: a policy learned under some distribution of states may not perform well if this distribution changes.

This is already a common issue in supervised learning, where the training dataset for a model might not resemble the environment where it gets deployed. In interactive environments, this issue is further exacerbated by the dependency between the observations and the agent’s behaviour; if you take a wrong turn early on, it may be difficult or impossible to recover in that trajectory.
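
A back-of-the-envelope way to see how errors compound (a toy calculation under the strong, purely illustrative assumption of independent per-step errors at rate $\varepsilon$): the chance of staying mistake-free, and hence on the expert’s state distribution, for all $H$ steps is $(1 - \varepsilon)^H$, which decays exponentially with the horizon.

```python
# Chance of an H-step trajectory with zero mistakes, assuming
# (hypothetically) independent errors of rate eps at each step.
H = 100
for eps in [0.01, 0.05, 0.1]:
    print(f"eps={eps:.2f}: P(no mistakes in {H} steps) = {(1 - eps) ** H:.4f}")
```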

How could you learn a strategy for these new settings? In the driving example, you might decide to install a dashcam to record the car’s surroundings. That way, once you make it back to safety, you can show the recording to an expert, who can provide feedback at each step of the way. Then the next time you go for a drive, you can remember the expert’s advice, and take a safer route. You could then repeat this training as many times as desired, thereby collecting the expert’s feedback over a diverse range of locations. This is the key idea behind dataset aggregation.

7.4 Dataset aggregation (DAgger)

The DAgger algorithm is due to Ross et al. (2010).

def dagger_pseudocode(
     env: MAB,
     π_init: Policy,
     π_expert: Policy,
@@ -66,9 +66,9 @@
         
         π = fit(dataset)
     
    return π
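
To see the same scheme run end to end, here is a self-contained toy instantiation: roll out the current policy, ask the expert to label the states it actually visits, aggregate those labels, and refit. The chain environment, the expert, and the table-lookup fit are all hypothetical stand-ins, not part of the notes’ codebase.

```python
import numpy as np

# Toy DAgger on an 11-state chain whose expert always steps toward state 5.
rng = np.random.default_rng(0)
H, n_states = 20, 11

def expert(s):
    return 1 if s < 5 else (-1 if s > 5 else 0)

def collect_states(policy):
    """Roll out the current policy and record the states it visits."""
    s, visited = 0, []
    for _ in range(H):
        visited.append(s)
        s = int(np.clip(s + policy(s), 0, n_states - 1))
    return visited

dataset = {}                                     # state -> expert-labeled action
policy = lambda s: int(rng.choice([-1, 0, 1]))   # poor initial policy

for _ in range(5):                               # DAgger iterations
    for s in collect_states(policy):
        dataset[s] = expert(s)                   # query the expert on visited states
    table = dict(dataset)                        # "fit": memorize aggregated labels
    policy = lambda s, t=table: t.get(s, 0)      # greedy lookup policy

print({s: policy(s) for s in range(n_states)})   # agrees with the expert where visited
```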

How well does DAgger perform?

References
  1. Ross, S., Gordon, G. J., & Bagnell, J. (2010, November). A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning. International Conference on Artificial Intelligence and Statistics.
\ No newline at end of file diff --git a/imitation-learning.json b/imitation-learning.json index 564cf9d..5714b0e 100644 --- a/imitation-learning.json +++ b/imitation-learning.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"1e76726d66e846c6b0aed795c9cfc8b5359c0fc8bc249124a868f2881ec3941c","slug":"imitation-learning","location":"/imitation_learning.md","dependencies":[],"frontmatter":{"title":"7 Imitation Learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"7.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"imitation_learning.md","url":"/build/imitation_learning-bf860cb6679fb159939c7b8b45aabd4b.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"nTjV4KsWY7"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"7.1","key":"aUDd2iSYrG"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"Imagine you are tasked with learning how to drive. 
How do, or did, you go about it?\nAt first, this task might seem insurmountable: there are a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error.\nLuckily, there are already people in the world who know how to drive who can get you started.\nIn this and many other examples, we all “stand on the shoulders of giants” and learn skills from experts who have already mastered them.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"iqrCjVVXff"}],"key":"Bpsc6k3PNX"},{"type":"paragraph","position":{"start":{"line":25,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Now in machine learning, much of the time, we are trying to teach machines to accomplish tasks that us humans are already proficient at.\nIn such cases, the machine learning algorithm is the one learning the new skill, and humans are the “experts” that can demonstrate how to perform the task.\n","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"E1rjfQty9q"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"Imitation learning","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"wfwn9UvVsC"}],"key":"CKo64pjidI"},{"type":"text","value":" is a direct application of this idea to machine learning for interactive tasks.\nWe’ll see that the most naive form of imitation learning, called ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"TPn5KFM6QV"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"bh8BMQtsxW"}],"key":"s2E1BuPFkM"},{"type":"text","value":", is really an application of supervised learning to interactive tasks.\nWe’ll then explore ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"GodsoesRQV"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"dataset aggregation","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"kxrVaenTpL"}],"key":"YZtUdJeaTw"},{"type":"text","value":" (DAgger) as a way to query an expert and learn even more effectively.","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"VTRkMU2xan"}],"key":"KFsGqWvK3D"},{"type":"heading","depth":2,"position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"Behavioral cloning","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"aXBH5tn2DH"}],"identifier":"behavioral-cloning","label":"Behavioral cloning","html_id":"behavioral-cloning","implicit":true,"enumerator":"7.2","key":"MYkha7zjHt"},{"type":"paragraph","position":{"start":{"line":33,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"This notion of “learning from human-provided data” may remind you of the basic premise of ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"b9tkAFvtI4"},{"type":"link","url":"/supervised-learning","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"4 Supervised 
learning","key":"ziZ5XDepBN"}],"urlSource":"./supervised_learning.md","dataUrl":"/supervised-learning.json","internal":true,"protocol":"file","key":"GSUoX4dVUV"},{"type":"text","value":",\nin which there is some mapping from ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"UUfLJOpual"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"inputs","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"FfZCJKn56Q"}],"key":"rtPcF2s8Wj"},{"type":"text","value":" to ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"jTkFQQPDYX"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"outputs","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"QSJlOaAger"}],"key":"GcQkXpHg3G"},{"type":"text","value":" that us humans can implicitly compute, such as seeing a photo and being able to recognize its constituents.\nTo teach a machine to calculate this mapping, we first collect a large ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"WKZe569iLq"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"training dataset","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"dr2Eded2PF"}],"key":"qJqkRDT2aR"},{"type":"text","value":" by getting people to label a lot of inputs,\nand then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible.\nHow does this relate to interactive tasks?\nHere, the input is the observation seen by the agent and the output is the action it selects, so the mapping is the agent’s policy.\nWhat’s stopping us from applying supervised learning techniques?\nIn practice, nothing! This is called ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"J2wnDoZDB2"},{"type":"strong","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"behavioral cloning.","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"a2WmUhBm3N"}],"key":"DXHUi2T23f"}],"key":"cU0hIfBtus"},{"type":"proof","kind":"definition","label":"behavioral_cloning","identifier":"behavioral_cloning","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Behavioral cloning","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"b5MrA04bx9"}],"key":"vYuSxBlLyx"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":46,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":46,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"text","value":"Collect a training dataset of trajectories generated by an expert policy ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"k7dLIxbYwk"},{"type":"inlineMath","value":"\\pi_\\text{data}","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"πdata\\pi_\\text{data}πdata","key":"vmjA3RtnrR"},{"type":"text","value":". 
Here, we treat each state-action pair as independent, resulting in a dataset ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"LrY5Za9zVX"},{"type":"inlineMath","value":"\\mathcal{D} = (s^n, a^n)_{n=1}^{N}","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"D=(sn,an)n=1N\\mathcal{D} = (s^n, a^n)_{n=1}^{N}D=(sn,an)n=1N","key":"GRCyb5w9eP"},{"type":"text","value":". (For concreteness, if there are ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"WQsDHRnyaW"},{"type":"inlineMath","value":"M","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"MMM","key":"EjJ6cYRvih"},{"type":"text","value":" trajectories with a horizon ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"JNM1NgLIPf"},{"type":"inlineMath","value":"H","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"HHH","key":"y3ee5PP0no"},{"type":"text","value":", then ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"Ix6kcixH4N"},{"type":"inlineMath","value":"N = M \\times H","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"N=M×HN = M \\times HN=M×H","key":"n6emBDXFyx"},{"type":"text","value":".)","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"O3JaEH2uTn"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"text","value":"Note that this is an inaccurate approximation! A key property of interactive tasks is that the agent’s output -- the action that it takes -- may influence its next observation.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"yqe5qdKuIf"}],"key":"QkYCzsHq6A"}],"key":"fOerDO8QCJ"}],"key":"le6n5jdil1"},{"type":"listItem","spread":true,"position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"Use an SL algorithm ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"Pa5g9oiA60"},{"type":"inlineMath","value":"\\texttt{fit} : \\mathcal{D} \\mapsto \\tilde \\pi","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"fit:Dπ~\\texttt{fit} : \\mathcal{D} \\mapsto \\tilde \\pifit:Dπ~","key":"gt6xZEai3e"},{"type":"text","value":" to extract a policy ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"Y2SxxSee7m"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"π~\\tilde \\piπ~","key":"YTgl3gMyxM"},{"type":"text","value":" that approximates the expert policy.","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"t0uZY26SLY"}],"key":"fObapHNzRi"}],"key":"cHm43lIzDB"}],"enumerator":"7.1","html_id":"behavioral-cloning","key":"LcNIHQaJgF"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"Typically, this second task can be framed as ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"gLizvhdE2n"},{"type":"strong","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"empirical loss 
minimization","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"DaWVzXuw51"}],"key":"SwuZQCBOol"},{"type":"text","value":":","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"ZaVoy5lHMC"}],"key":"IZDP5hsdWC"},{"type":"math","value":"\\tilde \\pi = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)","html":"π~=argminπΠn=0N1loss(π(sn),an)\\tilde \\pi = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)π~=argπΠminn=0N1loss(π(sn),an)","enumerator":"7.1","key":"RouyQsb0w0"},{"type":"paragraph","position":{"start":{"line":57,"column":1},"end":{"line":60,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"h1u1Adn2ru"},{"type":"text","value":"Π","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"xZD87W3Kj9"},{"type":"text","value":" is some class of possible policies, ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"GCMOjAcTwQ"},{"type":"inlineMath","value":"\\text{loss}","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"loss\\text{loss}loss","key":"ryEK9owKUj"},{"type":"text","value":" is the loss function to measure how far off the policy’s prediction is, and the SL algorithm tells us how to compute this ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"WMe9HdToe2"},{"type":"inlineMath","value":"\\arg\\min","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"argmin\\arg\\minargmin","key":"Q0zJeKA5ep"},{"type":"text","value":".\nIf training a deterministic policy that is just a function from inputs to outputs with no randomness, we might try to minimize the ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"adcB5vFRos"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"mean squared error","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"QA7ErsHbeW"}],"key":"CvWEzjAQPj"},{"type":"text","value":".\nMore generally, though, we often choose the ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"Yx3jEixJZo"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"negative log likelihood","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"KTYL6xSAlF"}],"key":"Sq382vo8ab"},{"type":"text","value":" as our loss function, so that the optimization is equivalent to ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"EPFt3IVPhg"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"maximum likelihood estimation","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"EOx5LTtJZv"}],"key":"AO5XkJchQE"},{"type":"text","value":":\nout of the space of all possible mappings, we search for the one according to which the training dataset is the most likely.","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"SFjVKcqiSo"}],"key":"c8zlyaFwl9"},{"type":"math","value":"\\tilde \\pi = \\arg\\max_{\\pi \\in \\Pi} \\pr_{a^n \\sim \\pi(s^n)}(a^{0:N} \\mid s^{0:N})","html":"π~=argmaxπΠPanπ(sn)(a0:Ns0:N)\\tilde \\pi = \\arg\\max_{\\pi \\in \\Pi} \\pr_{a^n \\sim 
\\pi(s^n)}(a^{0:N} \\mid s^{0:N})π~=argπΠmaxPanπ(sn)(a0:Ns0:N)","enumerator":"7.2","key":"R4VfHEh2Vc"},{"type":"paragraph","position":{"start":{"line":66,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"Can we quantify how well this algorithm works?\nFor simplicity, let’s consider the case where the action space is discrete and both the data and trained policy are deterministic.\n(This corresponds to a classification task in SL.)\nSuppose the SL algorithm obtains ","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"OI4BmwT3cp"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"html":"ε\\varepsilonε","key":"dxfbcUEhcl"},{"type":"text","value":" classification error.\nThat is, for trajectories drawn from the expert policy,\nthe learned policy chooses a different action at most ","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"h8ZMH3QtWA"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"html":"ε\\varepsilonε","key":"t0BqYV3yR9"},{"type":"text","value":" of the time:","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"bYI1KAZ3Cf"}],"key":"EFZl09cpwJ"},{"type":"math","value":"\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{data}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\tilde \\pi(s_\\hi) \\ne \\pi_{\\text{data}} (s_\\hi) } \\right] \\le \\varepsilon","html":"Eτρπdata[1Hh=0H11{π~(sh)πdata(sh)}]ε\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{data}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\tilde \\pi(s_\\hi) \\ne \\pi_{\\text{data}} (s_\\hi) } \\right] \\le \\varepsilonEτρπdata[H1h=0H11{π~(sh)=πdata(sh)}]ε","enumerator":"7.3","key":"tSvyriACXu"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"Then, their value functions differ by","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"TW9X1rsgGH"}],"key":"J68hyJSiYm"},{"type":"math","value":"| V^{\\pi_{\\text{data}}} - V^{\\tilde \\pi} | \\le H^2 \\varepsilon","html":"VπdataVπ~H2ε| V^{\\pi_{\\text{data}}} - V^{\\tilde \\pi} | \\le H^2 \\varepsilonVπdataVπ~H2ε","enumerator":"7.4","key":"ve8PEP5keM"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"btenoHvdxs"},{"type":"inlineMath","value":"H","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"HHH","key":"MhNCFROzXT"},{"type":"text","value":" is the horizon.","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"IKvgcDPUpi"}],"key":"A7b5dyalZs"},{"type":"proof","kind":"theorem","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance of behavioral cloning","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"DYFK7APp8D"}],"key":"lj7yoEjbqQ"},{"type":"paragraph","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"Recall the ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"ejyd8o45UT"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem 
","key":"sbxvl6w8W5"},{"type":"text","value":"6.1","key":"bVihDMk8Rf"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","remote":true,"url":"/pg","dataUrl":"/pg.json","key":"cmDYa0Umks"},{"type":"text","value":" allows us to express the difference between ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"iH1uMx4Oal"},{"type":"inlineMath","value":"\\pi_{\\text{data}}","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"html":"πdata\\pi_{\\text{data}}πdata","key":"PVc2BUEjMT"},{"type":"text","value":" and ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"XFYrCWOytB"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"html":"π~\\tilde \\piπ~","key":"xdTX3luH8u"},{"type":"text","value":" as","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"FimwuU7ugT"}],"key":"Y0dXkwtG3p"},{"type":"math","value":"V_0^{\\pi_{\\text{data}}}(s) - V_0^{\\tilde \\pi} (s) = \\E_{\\tau \\sim \\rho^{\\pi_{\\text{data}}} \\mid s_0 = s} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\tilde \\pi} (s_\\hi, a_\\hi) \\right].","position":{"start":{"line":89,"column":1},"end":{"line":91,"column":1}},"html":"V0πdata(s)V0π~(s)=Eτρπdatas0=s[h=0H1Ahπ~(sh,ah)].V_0^{\\pi_{\\text{data}}}(s) - V_0^{\\tilde \\pi} (s) = \\E_{\\tau \\sim \\rho^{\\pi_{\\text{data}}} \\mid s_0 = s} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\tilde \\pi} (s_\\hi, a_\\hi) \\right].V0πdata(s)V0π~(s)=Eτρπdatas0=s[h=0H1Ahπ~(sh,ah)].","enumerator":"7.5","key":"F7nK6045qk"},{"type":"paragraph","position":{"start":{"line":93,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"text","value":"Now since the data policy is deterministic, we can substitute ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"jC33aWEOZN"},{"type":"inlineMath","value":"a_\\hi = \\pi_{\\text{data}}(s_\\hi)","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"ah=πdata(sh)a_\\hi = \\pi_{\\text{data}}(s_\\hi)ah=πdata(sh)","key":"XYgDUIvTgZ"},{"type":"text","value":".\nThis allows us to make a further simplification:\nsince ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"Vr3Z2csdRd"},{"type":"inlineMath","value":"\\pi_{\\text{data}}","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"πdata\\pi_{\\text{data}}πdata","key":"sQiJ700oRC"},{"type":"text","value":" is deterministic, we have","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"qHi4WgGpAT"}],"key":"G7jLrm0YUr"},{"type":"math","value":"A^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) = Q^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) - V^{\\pi_{\\text{data}}}(s) = 0.","position":{"start":{"line":97,"column":1},"end":{"line":99,"column":1}},"html":"Aπdata(s,πdata(s))=Qπdata(s,πdata(s))Vπdata(s)=0.A^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) = Q^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) - V^{\\pi_{\\text{data}}}(s) = 0.Aπdata(s,πdata(s))=Qπdata(s,πdata(s))Vπdata(s)=0.","enumerator":"7.6","key":"EnrI5AcxaM"},{"type":"paragraph","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"Now we can use the assumption that the SL algorithm obtains 
","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"sKCzbwqJfH"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"ε\\varepsilonε","key":"Lw9xuzUetQ"},{"type":"text","value":" classification error. By the above, ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"yXbEGTcqMy"},{"type":"inlineMath","value":"A_\\hi^{\\tilde \\pi}(s_\\hi, \\pi_{\\text{data}}(s_\\hi)) = 0","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"Ahπ~(sh,πdata(sh))=0A_\\hi^{\\tilde \\pi}(s_\\hi, \\pi_{\\text{data}}(s_\\hi)) = 0Ahπ~(sh,πdata(sh))=0","key":"FNYuX1UGkL"},{"type":"text","value":" when ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"YcpO471LJN"},{"type":"inlineMath","value":"\\pi_{\\text{data}}(s_\\hi) = \\tilde \\pi(s_\\hi)","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"πdata(sh)=π~(sh)\\pi_{\\text{data}}(s_\\hi) = \\tilde \\pi(s_\\hi)πdata(sh)=π~(sh)","key":"kzOv5CHhpy"},{"type":"text","value":". In the case where the two policies differ on ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"vS8zZxDg0m"},{"type":"inlineMath","value":"s_\\hi","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"shs_\\hish","key":"F7CzpxUdpD"},{"type":"text","value":", which occurs with probability ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"YlIcLIO9yM"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"ε\\varepsilonε","key":"ya9u8b94UM"},{"type":"text","value":", the advantage is naively upper bounded by ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"LEIolV4Xnf"},{"type":"inlineMath","value":"H","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"HHH","key":"ieI9ECCQvJ"},{"type":"text","value":" (assuming rewards are bounded between ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"tkYWQ0PqyP"},{"type":"text","value":"0","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"u154xSRj3g"},{"type":"text","value":" and ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"GDRaw4xODF"},{"type":"text","value":"1","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"aiMAcGXGzP"},{"type":"text","value":"). Taking the final sum gives the desired bound.","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"zJdMgqluBK"}],"key":"qJxNpAKWh2"}],"enumerator":"7.1","key":"AWc1ty26Vx"},{"type":"comment","value":" TODO ADD DISTRIBUTION SHIFT EXAMPLE FROM SLIDES ","key":"QMILszjTXr"},{"type":"heading","depth":2,"position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"Distribution shift","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"lEH5rL773w"}],"identifier":"distribution-shift","label":"Distribution shift","html_id":"distribution-shift","implicit":true,"enumerator":"7.3","key":"ZMseiUpnFF"},{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"Let us return to the driving analogy. 
Suppose you have taken some driving lessons and now feel comfortable in your neighbourhood. But today you have to travel to an area you haven’t visited before, such as a highway, where it would be dangerous to try and apply the techniques you’ve already learned.\nThis is the issue of ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"beQo7MVjNh"},{"type":"emphasis","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"children":[{"type":"text","value":"distribution shift","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"ipJLS35mgS"}],"key":"EcWB89tWY8"},{"type":"text","value":": a policy learned under some distribution of states may not perform well if this distribution changes.","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"Owtxbc15kC"}],"key":"fHkXNEASEx"},{"type":"paragraph","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"children":[{"type":"text","value":"This is already a common issue in supervised learning, where the training dataset for a model might not resemble the environment where it gets deployed. In interactive environments, this issue is further exacerbated by the dependency between the observations and the agent’s behaviour; if you take a wrong turn early on, it may be difficult or impossible to recover in that trajectory.","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"Hh9AXly8oB"}],"key":"UOQkim59xQ"},{"type":"paragraph","position":{"start":{"line":113,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"text","value":"How could you learn a strategy for these new settings?\nIn the driving example, you might decide to install a dashcam to record the car’s surroundings. 
That way, once you make it back to safety, you can show the recording to an expert, who can provide feedback at each step of the way.\nThen the next time you go for a drive, you can remember the expert’s advice, and take a safer route.\nYou could then repeat this training as many times as desired, thereby collecting the expert’s feedback over a diverse range of locations.\nThis is the key idea behind ","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"rI3Iwby4Dk"},{"type":"emphasis","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"dataset aggregation","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"yo9CsXHC6t"}],"key":"ZUsQFRodBv"},{"type":"text","value":".","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"gFDfesA71r"}],"key":"C02gE9Dbuw"},{"type":"heading","depth":2,"position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"children":[{"type":"text","value":"Dataset aggregation (DAgger)","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"Qg4jFSc1gT"}],"identifier":"dataset-aggregation-dagger","label":"Dataset aggregation (DAgger)","html_id":"dataset-aggregation-dagger","implicit":true,"enumerator":"7.4","key":"QFyp4B4xfF"},{"type":"paragraph","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"The DAgger algorithm is due to ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"zIPLDjVfE2"},{"type":"cite","kind":"narrative","label":"ross_reduction_2010","identifier":"ross_reduction_2010","children":[{"type":"text","value":"Ross ","key":"dgpnUv31NQ"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"ovfVJ5QJRw"}],"key":"nMd2kTjVIW"},{"type":"text","value":" (2010)","key":"kGefshH6DS"}],"enumerator":"1","key":"jRTc1pXG27"},{"type":"text","value":".","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"cRAXzsiQs3"}],"key":"n1DQGjMR8P"},{"type":"code","lang":"python","value":"def dagger_pseudocode(\n env: MAB,\n π_init: Policy,\n π_expert: Policy,\n n_dagger_iterations: int,\n n_trajectories_per_iteration: int\n):\n π = π_init\n dataset = set()\n\n for _ in range(n_dagger_iterations):\n for __ in range(n_trajectories_per_iteration):\n τ = collect_trajectory(π, env)\n for step in range(env.H):\n obs = τ.state[step]\n τ.action[step] = π_expert(obs)\n dataset.add(τ)\n \n π = fit(dataset)\n \n return π","position":{"start":{"line":123,"column":1},"end":{"line":145,"column":1}},"key":"uAQ1IghWCD"},{"type":"paragraph","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"How well does DAgger perform?","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"eJk3o6XU16"}],"key":"FOTsR9fa7f"},{"type":"comment","value":" TODO ","key":"DKxOVPhgW6"}],"key":"a9PHInb4hg"}],"key":"Y9pJQRx8pk"},"references":{"cite":{"order":["ross_reduction_2010"],"data":{"ross_reduction_2010":{"label":"ross_reduction_2010","enumerator":"1","html":"Ross, S., Gordon, G. J., & Bagnell, J. (2010, November). A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning. 
International Conference on Artificial Intelligence and Statistics."}}}},"footer":{"navigation":{"prev":{"title":"6 Policy Optimization","url":"/pg","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"8 Planning","url":"/planning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"1e76726d66e846c6b0aed795c9cfc8b5359c0fc8bc249124a868f2881ec3941c","slug":"imitation-learning","location":"/imitation_learning.md","dependencies":[],"frontmatter":{"title":"7 Imitation Learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"7.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"imitation_learning.md","url":"/build/imitation_learning-bf860cb6679fb159939c7b8b45aabd4b.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"oXh8i5tLc1"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"7.1","key":"uP4xl71ybO"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"Imagine you are tasked with learning how to drive. 
How do, or did, you go about it?\nAt first, this task might seem insurmountable: there is a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error.\nLuckily, there are already people in the world who know how to drive who can get you started.\nIn this and many other examples, we all “stand on the shoulders of giants” and learn skills from experts who have already mastered them.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"FhLwrFWBDc"}],"key":"vUi3Jmhvye"},{"type":"paragraph","position":{"start":{"line":25,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Now in machine learning, much of the time, we are trying to teach machines to accomplish tasks that we humans are already proficient at.\nIn such cases, the machine learning algorithm is the one learning the new skill, and humans are the “experts” that can demonstrate how to perform the task.\n","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"YBtrkvr1ux"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"Imitation learning","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"AcTbzYoL2c"}],"key":"MwYIHSbNoW"},{"type":"text","value":" is a direct application of this idea to machine learning for interactive tasks.\nWe’ll see that the most naive form of imitation learning, called ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"h9hwMUhAWv"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"eZLOaJVA9u"}],"key":"op1EzClOfH"},{"type":"text","value":", is really an application of supervised learning to interactive tasks.\nWe’ll then explore ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"KhdtZdKLyG"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"dataset aggregation","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"Vry7ZGQBma"}],"key":"O2fa4giLrL"},{"type":"text","value":" (DAgger) as a way to query an expert and learn even more effectively.","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"sveMZgpiXY"}],"key":"GX6ckKdf8M"},{"type":"heading","depth":2,"position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"Behavioral cloning","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"dDhbIaX9jU"}],"identifier":"behavioral-cloning","label":"Behavioral cloning","html_id":"behavioral-cloning","implicit":true,"enumerator":"7.2","key":"mOmaFJf5hh"},{"type":"paragraph","position":{"start":{"line":33,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"This notion of “learning from human-provided data” may remind you of the basic premise of ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"Ks466pQ2mN"},{"type":"link","url":"/supervised-learning","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"4 Supervised
learning","key":"HEQfpGCu1M"}],"urlSource":"./supervised_learning.md","dataUrl":"/supervised-learning.json","internal":true,"protocol":"file","key":"ZSKRy5rXyu"},{"type":"text","value":",\nin which there is some mapping from ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"EqGgfqd0IE"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"inputs","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"Aq8YqIdIlP"}],"key":"LdgE3k6DHw"},{"type":"text","value":" to ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"E7Z2cqIrsy"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"outputs","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"SGHap08AG5"}],"key":"rvJPWDFbui"},{"type":"text","value":" that we humans can implicitly compute, such as seeing a photo and being able to recognize its constituents.\nTo teach a machine to calculate this mapping, we first collect a large ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"jPze3EXvV9"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"training dataset","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"vLVnalZlcM"}],"key":"f9nOvVsYOv"},{"type":"text","value":" by getting people to label a lot of inputs,\nand then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible.\nHow does this relate to interactive tasks?\nHere, the input is the observation seen by the agent and the output is the action it selects, so the mapping is the agent’s policy.\nWhat’s stopping us from applying supervised learning techniques?\nIn practice, nothing! This is called ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"NueeScPtUY"},{"type":"strong","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"behavioral cloning.","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"rwxQzYSm6n"}],"key":"f3bpMdCJXK"}],"key":"D3P90FE71O"},{"type":"proof","kind":"definition","label":"behavioral_cloning","identifier":"behavioral_cloning","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Behavioral cloning","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"CuoTGx7I3Y"}],"key":"SVA85mArSp"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":46,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":46,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"text","value":"Collect a training dataset of trajectories generated by an expert policy ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"dFdWNpB6Z0"},{"type":"inlineMath","value":"\\pi_\\text{data}","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"πdata\\pi_\\text{data}πdata","key":"DQFdIqpqRS"},{"type":"text","value":". 
Here, we treat each state-action pair as independent, resulting in a dataset ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"OZVzizL1ox"},{"type":"inlineMath","value":"\\mathcal{D} = (s^n, a^n)_{n=1}^{N}","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"D=(sn,an)n=1N\\mathcal{D} = (s^n, a^n)_{n=1}^{N}D=(sn,an)n=1N","key":"obD0KGPTCb"},{"type":"text","value":". (For concreteness, if there are ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"cWhTonljeO"},{"type":"inlineMath","value":"M","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"MMM","key":"GrOQxW3Eii"},{"type":"text","value":" trajectories with a horizon ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"HwNFWVZKua"},{"type":"inlineMath","value":"H","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"HHH","key":"qZ263wYgE6"},{"type":"text","value":", then ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"eyxUzZtJzA"},{"type":"inlineMath","value":"N = M \\times H","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"N=M×HN = M \\times HN=M×H","key":"vZg9EVe6mv"},{"type":"text","value":".)","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"dL9vgFN1tg"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"text","value":"Note that this is an inaccurate approximation! A key property of interactive tasks is that the agent’s output -- the action that it takes -- may influence its next observation.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"gUlotbtCDK"}],"key":"VnW2WuHsGE"}],"key":"vlVsRmZxKV"}],"key":"PgG5bLgWe9"},{"type":"listItem","spread":true,"position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"Use an SL algorithm ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"unUAGzD0Jn"},{"type":"inlineMath","value":"\\texttt{fit} : \\mathcal{D} \\mapsto \\tilde \\pi","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"fit:Dπ~\\texttt{fit} : \\mathcal{D} \\mapsto \\tilde \\pifit:Dπ~","key":"zGitxRUPVv"},{"type":"text","value":" to extract a policy ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"AeUEesKSW3"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"π~\\tilde \\piπ~","key":"DCAunLkwKC"},{"type":"text","value":" that approximates the expert policy.","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"hZaZuzOlx2"}],"key":"kuYNYLOMuR"}],"key":"lM2KdOXn8N"}],"enumerator":"7.1","html_id":"behavioral-cloning","key":"Qqv98Fxssl"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"Typically, this second task can be framed as ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"b2zv6c7MY2"},{"type":"strong","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"empirical loss
minimization","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"PpYAo0TXhv"}],"key":"wEO6r81XoP"},{"type":"text","value":":","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"vesWO6joHd"}],"key":"fxj4ue0hj7"},{"type":"math","value":"\\tilde \\pi = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)","html":"π~=argminπΠn=0N1loss(π(sn),an)\\tilde \\pi = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)π~=argπΠminn=0N1loss(π(sn),an)","enumerator":"7.1","key":"NEHPiVjfDp"},{"type":"paragraph","position":{"start":{"line":57,"column":1},"end":{"line":60,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"tjVBRiz8OM"},{"type":"text","value":"Π","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"cC7rJoN50G"},{"type":"text","value":" is some class of possible policies, ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"K8jlx0rw5p"},{"type":"inlineMath","value":"\\text{loss}","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"loss\\text{loss}loss","key":"WF6FZvCuxO"},{"type":"text","value":" is the loss function to measure how far off the policy’s prediction is, and the SL algorithm tells us how to compute this ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"FU8zR8Y4LK"},{"type":"inlineMath","value":"\\arg\\min","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"argmin\\arg\\minargmin","key":"gs9AEWlznE"},{"type":"text","value":".\nIf training a deterministic policy that is just a function from inputs to outputs with no randomness, we might try to minimize the ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"ekcvI4J5gE"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"mean squared error","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"OajRx00tRw"}],"key":"DuJEsxgDO8"},{"type":"text","value":".\nMore generally, though, we often choose the ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"yDba0XDiPb"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"negative log likelihood","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"IfxSTiy9jD"}],"key":"wtRPCMWLXa"},{"type":"text","value":" as our loss function, so that the optimization is equivalent to ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"spy78n0s1f"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"maximum likelihood estimation","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"RWpom3CdP9"}],"key":"ZUKuXlHfsm"},{"type":"text","value":":\nout of the space of all possible mappings, we search for the one according to which the training dataset is the most likely.","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"VSwAOhZxWQ"}],"key":"I9R8PK3sy6"},{"type":"math","value":"\\tilde \\pi = \\arg\\max_{\\pi \\in \\Pi} \\pr_{a^n \\sim \\pi(s^n)}(a^{0:N} \\mid s^{0:N})","html":"π~=argmaxπΠPanπ(sn)(a0:Ns0:N)\\tilde \\pi = \\arg\\max_{\\pi \\in \\Pi} \\pr_{a^n \\sim 
\\pi(s^n)}(a^{0:N} \\mid s^{0:N})π~=argπΠmaxPanπ(sn)(a0:Ns0:N)","enumerator":"7.2","key":"akh00Gigph"},{"type":"paragraph","position":{"start":{"line":66,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"Can we quantify how well this algorithm works?\nFor simplicity, let’s consider the case where the action space is discrete and both the data and trained policy are deterministic.\n(This corresponds to a classification task in SL.)\nSuppose the SL algorithm obtains ","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"ZKV6A2HTDP"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"html":"ε\\varepsilonε","key":"VAOh0Tl49V"},{"type":"text","value":" classification error.\nThat is, for trajectories drawn from the expert policy,\nthe learned policy chooses a different action at most ","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"gwccsB9bnj"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"html":"ε\\varepsilonε","key":"m3PhHXxSPj"},{"type":"text","value":" of the time:","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"gHwJBWcuAY"}],"key":"qoyCmV1ZkX"},{"type":"math","value":"\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{data}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\tilde \\pi(s_\\hi) \\ne \\pi_{\\text{data}} (s_\\hi) } \\right] \\le \\varepsilon","html":"Eτρπdata[1Hh=0H11{π~(sh)πdata(sh)}]ε\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{data}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\tilde \\pi(s_\\hi) \\ne \\pi_{\\text{data}} (s_\\hi) } \\right] \\le \\varepsilonEτρπdata[H1h=0H11{π~(sh)=πdata(sh)}]ε","enumerator":"7.3","key":"JapqNvqqyP"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"Then, their value functions differ by","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"rQQcvoL2dx"}],"key":"JilhKsyFmh"},{"type":"math","value":"| V^{\\pi_{\\text{data}}} - V^{\\tilde \\pi} | \\le H^2 \\varepsilon","html":"VπdataVπ~H2ε| V^{\\pi_{\\text{data}}} - V^{\\tilde \\pi} | \\le H^2 \\varepsilonVπdataVπ~H2ε","enumerator":"7.4","key":"Xh4FfHYkcc"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"FmAlkedGmG"},{"type":"inlineMath","value":"H","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"HHH","key":"qLLQ1Afb9L"},{"type":"text","value":" is the horizon.","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"lwlBsMJBHb"}],"key":"b7uUNdqsH4"},{"type":"proof","kind":"theorem","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance of behavioral cloning","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"y8XchgvHJc"}],"key":"HwZabIwGUy"},{"type":"paragraph","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"Recall the ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"GAzZOVwp2P"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem 
","key":"jYgIaIx9Dg"},{"type":"text","value":"6.1","key":"TDsSgBYhdX"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","remote":true,"url":"/pg","dataUrl":"/pg.json","key":"tfkWlO4dmH"},{"type":"text","value":" allows us to express the difference between ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"wzzzPHVcqk"},{"type":"inlineMath","value":"\\pi_{\\text{data}}","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"html":"πdata\\pi_{\\text{data}}πdata","key":"SpjYCGalyi"},{"type":"text","value":" and ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"N3iCiTdjLw"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"html":"π~\\tilde \\piπ~","key":"GP37lQfihj"},{"type":"text","value":" as","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"Sytr8emzRd"}],"key":"gZVuQeDraM"},{"type":"math","value":"V_0^{\\pi_{\\text{data}}}(s) - V_0^{\\tilde \\pi} (s) = \\E_{\\tau \\sim \\rho^{\\pi_{\\text{data}}} \\mid s_0 = s} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\tilde \\pi} (s_\\hi, a_\\hi) \\right].","position":{"start":{"line":89,"column":1},"end":{"line":91,"column":1}},"html":"V0πdata(s)V0π~(s)=Eτρπdatas0=s[h=0H1Ahπ~(sh,ah)].V_0^{\\pi_{\\text{data}}}(s) - V_0^{\\tilde \\pi} (s) = \\E_{\\tau \\sim \\rho^{\\pi_{\\text{data}}} \\mid s_0 = s} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\tilde \\pi} (s_\\hi, a_\\hi) \\right].V0πdata(s)V0π~(s)=Eτρπdatas0=s[h=0H1Ahπ~(sh,ah)].","enumerator":"7.5","key":"lypGqarzgg"},{"type":"paragraph","position":{"start":{"line":93,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"text","value":"Now since the data policy is deterministic, we can substitute ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"u1cKdJkW4H"},{"type":"inlineMath","value":"a_\\hi = \\pi_{\\text{data}}(s_\\hi)","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"ah=πdata(sh)a_\\hi = \\pi_{\\text{data}}(s_\\hi)ah=πdata(sh)","key":"xeSJ2xQiKx"},{"type":"text","value":".\nThis allows us to make a further simplification:\nsince ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"OHzxKksRDO"},{"type":"inlineMath","value":"\\pi_{\\text{data}}","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"πdata\\pi_{\\text{data}}πdata","key":"As1VXCtcDF"},{"type":"text","value":" is deterministic, we have","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"cN22EzbAlk"}],"key":"LgNPFvBj7Q"},{"type":"math","value":"A^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) = Q^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) - V^{\\pi_{\\text{data}}}(s) = 0.","position":{"start":{"line":97,"column":1},"end":{"line":99,"column":1}},"html":"Aπdata(s,πdata(s))=Qπdata(s,πdata(s))Vπdata(s)=0.A^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) = Q^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) - V^{\\pi_{\\text{data}}}(s) = 0.Aπdata(s,πdata(s))=Qπdata(s,πdata(s))Vπdata(s)=0.","enumerator":"7.6","key":"I8nDk4SeFV"},{"type":"paragraph","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"Now we can use the assumption that the SL algorithm obtains 
","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"u01FihDZbh"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"ε\\varepsilonε","key":"sSmgQtZs2E"},{"type":"text","value":" classification error. By the above, ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"r6fZI9igoh"},{"type":"inlineMath","value":"A_\\hi^{\\tilde \\pi}(s_\\hi, \\pi_{\\text{data}}(s_\\hi)) = 0","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"Ahπ~(sh,πdata(sh))=0A_\\hi^{\\tilde \\pi}(s_\\hi, \\pi_{\\text{data}}(s_\\hi)) = 0Ahπ~(sh,πdata(sh))=0","key":"X1sA3tl6Hi"},{"type":"text","value":" when ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"gRDoqQUUXh"},{"type":"inlineMath","value":"\\pi_{\\text{data}}(s_\\hi) = \\tilde \\pi(s_\\hi)","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"πdata(sh)=π~(sh)\\pi_{\\text{data}}(s_\\hi) = \\tilde \\pi(s_\\hi)πdata(sh)=π~(sh)","key":"dw8g9uJaqa"},{"type":"text","value":". In the case where the two policies differ on ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"prDADnAiHK"},{"type":"inlineMath","value":"s_\\hi","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"shs_\\hish","key":"OA3zdxo4Yd"},{"type":"text","value":", which occurs with probability ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"CYPfV12peO"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"ε\\varepsilonε","key":"qOLZ8kkTQf"},{"type":"text","value":", the advantage is naively upper bounded by ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"GxI0fUmzSu"},{"type":"inlineMath","value":"H","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"HHH","key":"uDcu3f1dwz"},{"type":"text","value":" (assuming rewards are bounded between ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"CTMMv2EcU7"},{"type":"text","value":"0","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"HAUGvAwRDD"},{"type":"text","value":" and ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"eHaHWL8Q8i"},{"type":"text","value":"1","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"rptjfJSUIq"},{"type":"text","value":"). Taking the final sum gives the desired bound.","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"siAUJpThWk"}],"key":"Gyqwv9BlHa"}],"enumerator":"7.1","key":"iYuXkfm6wm"},{"type":"comment","value":" TODO ADD DISTRIBUTION SHIFT EXAMPLE FROM SLIDES ","key":"Taw695tfSQ"},{"type":"heading","depth":2,"position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"Distribution shift","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"nU9LKCjcVw"}],"identifier":"distribution-shift","label":"Distribution shift","html_id":"distribution-shift","implicit":true,"enumerator":"7.3","key":"IBgd17hZyD"},{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"Let us return to the driving analogy. 
Suppose you have taken some driving lessons and now feel comfortable in your neighbourhood. But today you have to travel to an area you haven’t visited before, such as a highway, where it would be dangerous to try and apply the techniques you’ve already learned.\nThis is the issue of ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"as8mnFXYTq"},{"type":"emphasis","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"children":[{"type":"text","value":"distribution shift","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"RmllCBxOFv"}],"key":"nSzzkXEauD"},{"type":"text","value":": a policy learned under some distribution of states may not perform well if this distribution changes.","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"FpRQHrY1Sy"}],"key":"jV0E40BIg4"},{"type":"paragraph","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"children":[{"type":"text","value":"This is already a common issue in supervised learning, where the training dataset for a model might not resemble the environment where it gets deployed. In interactive environments, this issue is further exacerbated by the dependency between the observations and the agent’s behaviour; if you take a wrong turn early on, it may be difficult or impossible to recover in that trajectory.","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"JnmoLoO9Xw"}],"key":"lHE3IlQmbo"},{"type":"paragraph","position":{"start":{"line":113,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"text","value":"How could you learn a strategy for these new settings?\nIn the driving example, you might decide to install a dashcam to record the car’s surroundings. 
That way, once you make it back to safety, you can show the recording to an expert, who can provide feedback at each step of the way.\nThen the next time you go for a drive, you can remember the expert’s advice, and take a safer route.\nYou could then repeat this training as many times as desired, thereby collecting the expert’s feedback over a diverse range of locations.\nThis is the key idea behind ","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"goWvkPi2AU"},{"type":"emphasis","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"dataset aggregation","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"dIYAmmX4bo"}],"key":"zHtvhXCAgr"},{"type":"text","value":".","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"zV4YzP8zG8"}],"key":"mfCZpQdqpQ"},{"type":"heading","depth":2,"position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"children":[{"type":"text","value":"Dataset aggregation (DAgger)","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"blZvFAU3RC"}],"identifier":"dataset-aggregation-dagger","label":"Dataset aggregation (DAgger)","html_id":"dataset-aggregation-dagger","implicit":true,"enumerator":"7.4","key":"oQXKG3nY2e"},{"type":"paragraph","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"The DAgger algorithm is due to ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"dWQfNx6Wie"},{"type":"cite","kind":"narrative","label":"ross_reduction_2010","identifier":"ross_reduction_2010","children":[{"type":"text","value":"Ross ","key":"w6TKNrKqtP"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"jcLVHPOIwO"}],"key":"QM2UU8engx"},{"type":"text","value":" (2010)","key":"GPIGtk8qB0"}],"enumerator":"1","key":"xr4SUNYGnC"},{"type":"text","value":".","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"jLC1yXGJ3P"}],"key":"dv5PJUiiv3"},{"type":"code","lang":"python","value":"def dagger_pseudocode(\n env: MDP,\n π_init: Policy,\n π_expert: Policy,\n n_dagger_iterations: int,\n n_trajectories_per_iteration: int\n):\n π = π_init\n dataset = set()\n\n for _ in range(n_dagger_iterations):\n for __ in range(n_trajectories_per_iteration):\n τ = collect_trajectory(π, env)\n for step in range(env.H):\n obs = τ.state[step]\n τ.action[step] = π_expert(obs)\n dataset.add(τ)\n \n π = fit(dataset)\n \n return π","position":{"start":{"line":123,"column":1},"end":{"line":145,"column":1}},"key":"pX8GIbEzMl"},{"type":"paragraph","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"How well does DAgger perform?","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"dg8llzYhfa"}],"key":"AYLCeONG4A"},{"type":"comment","value":" TODO ","key":"uAlI2xSmDS"}],"key":"FQmos3Rs3x"}],"key":"GsBnDRtkdy"},"references":{"cite":{"order":["ross_reduction_2010"],"data":{"ross_reduction_2010":{"label":"ross_reduction_2010","enumerator":"1","html":"Ross, S., Gordon, G. J., & Bagnell, J. (2010, November). A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning. 
International Conference on Artificial Intelligence and Statistics."}}}},"footer":{"navigation":{"prev":{"title":"6 Policy Gradient Methods","url":"/pg","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"8 Tree Search Methods","url":"/planning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/index.html b/index.html index 161b91a..9856e8f 100644 --- a/index.html +++ b/index.html @@ -14,34 +14,34 @@ ); root.querySelectorAll(".hide-mac").forEach(node => {node.classList.add(isMac ? "hidden" : "block")}); root.querySelectorAll(".show-mac").forEach(node => {node.classList.add(!isMac ? "hidden" : "block")}); -})()
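The imitation_learning notes above specify the behavioral cloning fit step only abstractly, as empirical loss minimization with the negative log likelihood loss. As a minimal sketch of what that step could look like, here is behavioral cloning for a tabular softmax policy, trained by gradient ascent on the log likelihood of the expert's actions. The function name `fit_bc`, the tabular parameterization, and the toy dataset are illustrative assumptions, not the book's actual `fit`:

```python
import numpy as np

def fit_bc(states, actions, n_states, n_actions, lr=0.5, n_epochs=500):
    """Behavioral cloning sketch: maximize the log likelihood of the
    expert's actions under a tabular softmax policy (illustrative only)."""
    logits = np.zeros((n_states, n_actions))
    for _ in range(n_epochs):
        # softmax over actions within each state
        probs = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs /= probs.sum(axis=1, keepdims=True)
        # gradient of the mean log likelihood: one-hot(expert action) - probs
        grad = np.zeros_like(logits)
        for s, a in zip(states, actions):
            grad[s] -= probs[s]
            grad[s, a] += 1.0
        logits += lr * grad / len(states)
    return logits  # the learned policy is π(a | s) = softmax(logits[s])[a]

# hypothetical (state, action) pairs flattened from expert trajectories
states = np.array([0, 1, 1, 2, 0, 2])
actions = np.array([1, 0, 0, 1, 1, 1])
logits = fit_bc(states, actions, n_states=3, n_actions=2)
```

With enough epochs, π(a | s) approaches the empirical distribution of the expert's actions in each state, which is exactly the maximum likelihood solution over this policy class.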

Introduction

Welcome to the study of reinforcement learning! This textbook accompanies the undergraduate course CS 1840/STAT 184 taught at Harvard. -It is intended to be a friendly yet rigorous introduction to this active subfield of machine learning.

1Prerequisites

This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability. +It is intended to be a friendly yet rigorous introduction to this active subfield of machine learning.

1Prerequisites

This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability. For Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents. Stat 111 is strongly recommended but not required. Specifically, we will assume that you know the following topics. The italicized terms have brief re-introductions in the text or in the Appendix: Background:

  • Linear Algebra: Vectors and matrices, matrix multiplication, matrix inversion, eigenvalues and eigenvectors.
  • Multivariable Calculus: Partial derivatives, the chain rule, Taylor series, gradients, directional derivatives, Lagrange multipliers.
  • Probability: Random variables, probability distributions, expectation and variance, the law of iterated expectations (Adam’s rule), covariance, conditional probability, Bayes’s rule, and the law of total probability.

You should also be comfortable with programming in Python. -See Section 6 for more about this textbook’s philosophy regarding programming.

2Reinforcement learning in a nutshell

Broadly speaking, +See Section 6 for more about this textbook’s philosophy regarding programming.

2Reinforcement learning in a nutshell

Broadly speaking, RL studies sequential decision-making in dynamic environments. An RL algorithm finds a strategy, called a policy, that maximizes the reward it obtains from the environment.

RL provides a powerful framework for attacking a wide variety of problems, including robotic control, video games and board games, resource management, language modelling, and more. It also provides an interdisciplinary paradigm for studying animal and human behavior. -Many of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms.

How does RL compare to the other two core machine learning paradigms, +Many of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms.

How does RL compare to the other two core machine learning paradigms, supervised learning and unsupervised learning?

  • Supervised learning (SL) concerns itself with learning a mapping from inputs to outputs. Typically the data takes the form of statistically independent input-output pairs. In RL, however, the data is generated by the agent interacting with the environment, meaning the sequential observations of the state are not independent from each other.

    Conversely, SL is a well-studied field that provides many useful tools for RL.

  • Unsupervised learning concerns itself with learning the structure of data without the use of outside feedback or labels. In RL, though, the agent receives a reward signal from the environment, -which can be thought of as a sort of feedback.

    Unsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other purposes.

3Core tasks of reinforcement learning

What tasks, exactly, does RL comprise? +which can be thought of as a sort of feedback.

Unsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other purposes.

3Core tasks of reinforcement learning

What tasks, exactly, does RL comprise? An RL algorithm must typically solve two main subtasks:

  • Policy evaluation (prediction): How ‘good’ is a specific state, or state-action pair (under a given policy)? That is, how much reward does it lead to in the long run?

  • Policy optimization (control): Suppose we fully understand how the environment behaves. -What is the best action to take in every scenario?

4Course overview

The course will progress through the following units:

1 Markov Decision Processes introduces Markov Decision Processes, +What is the best action to take in every scenario?
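As a concrete illustration of these two subtasks, the sketch below first evaluates a fixed policy and then computes an optimal one on a toy two-state MDP. It uses a discounted, infinite-horizon formulation for brevity, and all of the transition and reward numbers are made up for illustration:

```python
import numpy as np

# A toy two-state, two-action MDP (numbers invented for illustration).
# P[s, a, s'] is the transition probability; R[s, a] is the expected reward.
P = np.array([[[0.9, 0.1], [0.2, 0.8]],
              [[0.7, 0.3], [0.1, 0.9]]])
R = np.array([[1.0, 0.0],
              [0.5, 2.0]])
gamma = 0.9  # discount factor

# Policy evaluation (prediction): iterate the Bellman equation for the
# fixed policy "always take action 0" until its values converge.
pi = np.array([0, 0])
V = np.zeros(2)
for _ in range(500):
    V = R[np.arange(2), pi] + gamma * P[np.arange(2), pi] @ V

# Policy optimization (control): value iteration computes the optimal
# values, from which an optimal policy is read off greedily.
V_star = np.zeros(2)
for _ in range(500):
    V_star = (R + gamma * P @ V_star).max(axis=1)
pi_star = (R + gamma * P @ V_star).argmax(axis=1)
```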

4Course overview

The course will progress through the following units:

1 Markov Decision Processes introduces Markov Decision Processes, the core mathematical framework for describing a large class of interactive environments.

2 Linear Quadratic Regulators is a standalone chapter on the linear quadratic regulator (LQR), an important tool for continuous control, in which the state and action spaces are no longer finite but rather continuous. @@ -49,16 +49,16 @@ In exploring a number of algorithms, we will see how each of them strikes a different balance between exploring new options and exploiting known options. This exploration-exploitation tradeoff is a core consideration in RL algorithm design.

4 Supervised learning is a standalone crash course on some tools from supervised learning that we will use in later chapters.

5 Fitted Dynamic Programming Algorithms introduces fitted dynamic programming (fitted DP) algorithms for solving MDPs. -These algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly.

6 Policy Optimization explores an important class of algorithms based on iteratively improving a policy. +These algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly.

6 Policy Gradient Methods explores an important class of algorithms based on iteratively improving a policy. We will also encounter the use of deep neural networks to express more complicated policies and approximate complicated functions.

7 Imitation Learning attempts to learn a good policy from expert demonstrations. -At its most basic, this is an application of supervised learning to RL tasks.

8 Planning looks at ways to explicitly plan ahead when the environment’s dynamics are known. +At its most basic, this is an application of supervised learning to RL tasks.

8 Tree Search Methods looks at ways to explicitly plan ahead when the environment’s dynamics are known. We will study the Monte Carlo Tree Search heuristic, which has been used to great success in the famous AlphaGo algorithm and its successors.

9 Exploration in MDPs continues to investigate the exploration-exploitation tradeoff. -We will extend ideas from multi-armed bandits to the MDP setting.

Appendix: Background contains an overview of selected background mathematical content and programming content.

5Notation

We will use the following notation throughout the book. +We will extend ideas from multi-armed bandits to the MDP setting.

Appendix: Background contains an overview of selected background mathematical content and programming content.

5Notation

We will use the following notation throughout the book. This notation is inspired by Sutton & Barto (2018) and Agarwal et al. (2022). We use $[N]$ as shorthand for the set $\{ 0, 1, \dots, N-1 \}$.

| Element | Space | Definition (of element) |
| --- | --- | --- |
| $s$ | $\mathcal{S}$ | A state. |
| $a$ | $\mathcal{A}$ | An action. |
| $r$ | | A reward. |
| $\gamma$ | | A discount factor. |
| $\tau$ | $\mathcal{T}$ | A trajectory. |
| $\pi$ | $\Pi$ | A policy. |
| $V^\pi$ | $\mathcal{S} \to \mathbb{R}$ | The value function of policy $\pi$. |
| $Q^\pi$ | $\mathcal{S} \times \mathcal{A} \to \mathbb{R}$ | The action-value function (a.k.a. Q-function) of policy $\pi$. |
| $A^\pi$ | $\mathcal{S} \times \mathcal{A} \to \mathbb{R}$ | The advantage function of policy $\pi$. |
| | $\triangle(\mathcal{X})$ | A distribution supported on $\mathcal{X}$. |
| $h$ | $[H]$ | Time horizon index of an MDP (subscript). |
| $k$ | $[K]$ | Arm index of a multi-armed bandit (superscript). |
| $t$ | $[T]$ | Iteration index of an algorithm (subscript). |
| $\theta$ | $\Theta$ | A set of parameters. |

Note that throughout the text, certain symbols will stand for either random variables or fixed values. We aim to clarify in ambiguous settings. -Be warned that

6Programming

Why include code in a textbook? +Be warned that

6Programming

Why include code in a textbook? We believe that implementing an algorithm is a strong test of your understanding of it; mathematical notation can often abstract away details, while a computer must be given every single instruction. @@ -71,7 +71,7 @@ It uses the JAX library for numerical computing. JAX was chosen for the clarity of its functional style and due to its mature RL ecosystem, sustained in large part by the Google DeepMind research group and a large body of open-source contributors. -We use the standard Gymnasium library for interfacing with RL environments.

The following names are exported from the utils module:

import matplotlib.pyplot as plt
+We use the standard Gymnasium library for interfacing with RL environments.

The following names are exported from the utils module:

import matplotlib.pyplot as plt
 
 # convenient class builder
 from typing import NamedTuple
@@ -92,9 +92,9 @@
 # print functions as latex
 import latexify
 
-plt.style.use("fivethirtyeight")
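As mentioned above, the book interfaces with RL environments through the Gymnasium library. For readers who have not used it, a minimal interaction loop looks like the sketch below; the choice of CartPole-v1 and the random placeholder policy are illustrative only:

```python
import gymnasium as gym

# Run one episode with a random policy. Any registered Gymnasium
# environment follows the same reset/step interface.
env = gym.make("CartPole-v1")
obs, info = env.reset(seed=184)
total_reward, terminated, truncated = 0.0, False, False
while not (terminated or truncated):
    action = env.action_space.sample()  # stand-in for a learned policy
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
env.close()
print(f"episode return: {total_reward}")
```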
References
  1. Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction (Second edition). The MIT Press.
  2. Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms.
  3. Sussman, G. J., Wisdom, J., & Farr, W. (2013). Functional Differential Geometry. The MIT Press.
\ No newline at end of file diff --git a/index.json b/index.json index 83c87c4..0f98eeb 100644 --- a/index.json +++ b/index.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"2cdeee9bc604ea0150aa2ba9d0d7b73c09784f007761496df1c2715f83d28614","slug":"index","location":"/index.md","dependencies":[],"frontmatter":{"title":"Introduction","kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","numbering":{"all":{"enabled":true}},"math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"index.md","url":"/build/index-b84d1d5a6390c0b2f1723ee4aeac02d1.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":16,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Welcome to the study of reinforcement learning!\nThis textbook accompanies the undergraduate course ","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"U50RdvhTmZ"},{"type":"link","url":"http://lucasjanson.fas.harvard.edu/courses/CS_Stat_184_0.html","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"children":[{"type":"text","value":"CS 1840/STAT 184","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"mnzqmkI8sT"}],"urlSource":"http://lucasjanson.fas.harvard.edu/courses/CS_Stat_184_0.html","key":"qnD3Ep6Wi2"},{"type":"text","value":" taught at Harvard.\nIt is intended to be a friendly yet rigorous introduction to this active subfield of machine learning.","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"eXukxH0SYS"}],"key":"JnPOeLjq8g"}],"key":"U1bvN3bz1S"},{"type":"block","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"Prerequisites","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"CNDTA5Sw4W"}],"identifier":"prerequisites","label":"Prerequisites","html_id":"prerequisites","implicit":true,"enumerator":"1","key":"YQoIQpREnQ"},{"type":"paragraph","position":{"start":{"line":24,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability.\nFor Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents.\nStat 111 is strongly recommended but not required.\nSpecifically, we will assume that you know the following topics. 
The ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"ii7fD5OaV2"},{"type":"emphasis","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"italicized terms","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"GcCpBe2bt7"}],"key":"CDmpd56vYx"},{"type":"text","value":" have brief re-introductions in the text or in the ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"iPNCf9EMdS"},{"type":"link","url":"/background","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"Appendix: Background","key":"zQ5MFuF0Ms"}],"urlSource":"./background.md","dataUrl":"/background.json","internal":true,"protocol":"file","key":"I7H0Iw3UZN"},{"type":"text","value":":","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"q8tvQvbNZh"}],"key":"JIt5FbWE9K"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":29,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":29,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"strong","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Linear Algebra:","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"wKZx529XV7"}],"key":"F1KveDahzf"},{"type":"text","value":" Vectors and matrices, matrix multiplication, matrix\ninversion, eigenvalues and eigenvectors.","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"E3SySP5za7"}],"key":"aFLXLgDIrJ"},{"type":"listItem","spread":true,"position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"strong","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"Multivariable Calculus:","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"aOo4kxAwni"}],"key":"oPpZgcRr31"},{"type":"text","value":" Partial derivatives, the chain rule, Taylor series, ","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"OBqXzuqFPI"},{"type":"emphasis","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"gradients, directional derivatives, Lagrange multipliers.","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"safObjYhUI"}],"key":"W7d8ujeCnw"}],"key":"HRlqZFQYlz"},{"type":"listItem","spread":true,"position":{"start":{"line":32,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"strong","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"children":[{"type":"text","value":"Probability:","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"key":"TOLLVFzhYm"}],"key":"JZbGwsZhji"},{"type":"text","value":" Random variables, probability distributions,\nexpectation and variance, the law of iterated expectations (Adam’s rule), covariance, conditional probability, Bayes’s rule, and the law of total probability.","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"key":"lP2ZJw9nEU"}],"key":"eY8iMM2Nez"}],"key":"h2tf09Mzpl"},{"type":"paragraph","position":{"start":{"line":35,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"You should also be comfortable with programming in Python.\nSee 
","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"OnbQ68zCWw"},{"type":"crossReference","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Section ","key":"Q2Xqt7AHJ4"},{"type":"text","value":"6","key":"gp5C35VGec"}],"identifier":"programming","label":"programming","kind":"heading","template":"Section %s","enumerator":"6","resolved":true,"html_id":"programming","key":"cJn5x18liI"},{"type":"text","value":" for more about this textbook’s philosophy regarding programming.","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"G8CBxFXBBm"}],"key":"jHWVqSBKrl"}],"key":"Vh7i3ckg2A"},{"type":"block","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"Reinforcement learning in a nutshell","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"p7erLw07I1"}],"identifier":"reinforcement-learning-in-a-nutshell","label":"Reinforcement learning in a nutshell","html_id":"reinforcement-learning-in-a-nutshell","implicit":true,"enumerator":"2","key":"djwcs46D4p"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"Broadly speaking,\nRL studies ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"RO0G7P0jSH"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"sequential decision-making","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"hTiHLSwICV"}],"key":"V96a8HxxTM"},{"type":"text","value":" in ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"Jw6WaKdeHl"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"dynamic environments.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"bhBCZyZIud"}],"key":"AKKedapDEJ"},{"type":"text","value":"\nAn RL algorithm finds a strategy, called a ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"RW2iTmoAnB"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"policy,","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"irSbHcmgBi"}],"key":"umvzWL9ZQ4"},{"type":"text","value":" that maximizes the ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"T74jUZRYkZ"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"reward","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"eUlRmWze20"}],"key":"dYZJ5Cm0Ct"},{"type":"text","value":" it obtains from the environment.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"Kqiwo3IvAi"}],"key":"nPogZU27WW"},{"type":"paragraph","position":{"start":{"line":46,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"RL provides a powerful framework for attacking a wide variety of problems,\nincluding robotic control, video games and board games, resource management, language modelling, and more.\nIt also provides an interdisciplinary paradigm for studying animal and human 
behavior.\nMany of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms.","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"bDvZNcOO4I"}],"key":"uLaMhhBWHg"}],"key":"eNt9zrs3nS"},{"type":"block","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":53,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"How does RL compare to the other two core machine learning paradigms,\n","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"PwtSZINt4R"},{"type":"strong","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"LzHgFXfs0K"}],"key":"tP91dhFM6X"},{"type":"text","value":" and ","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"rJen7GXD4y"},{"type":"strong","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"unsupervised learning?","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"Aa09rgq4tr"}],"key":"CK4aVTk1LV"}],"key":"K6DCWwA60Q"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":56,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":56,"column":1},"end":{"line":62,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":56,"column":1},"end":{"line":59,"column":1}},"children":[{"type":"strong","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"Supervised learning","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"HqXf8snX8u"}],"key":"YbgRMz4zpp"},{"type":"text","value":" (SL) concerns itself with learning a mapping from inputs to outputs.\nTypically the data takes the form of ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"VoFX7rB9OJ"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"statistically independent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"SwWY1oBXmp"}],"key":"ouiNufA64m"},{"type":"text","value":" input-output pairs.\nIn RL, however, the data is generated by the agent interacting with the environment,\nmeaning the sequential observations of the state are ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"WuvJTcur7f"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"not independent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"LJsmqfBPFT"}],"key":"C1cVCTPWHA"},{"type":"text","value":" from each other.","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"tmOwKHfdWe"}],"key":"RJHQdi5wu3"},{"type":"paragraph","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"Conversely, SL is a well-studied field that provides many useful tools for 
RL.","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"w9cbz3TkNW"}],"key":"lF3EKScFcs"}],"key":"pp3U0Vm1UW"},{"type":"listItem","spread":true,"position":{"start":{"line":63,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":63,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"strong","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"Unsupervised learning","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"TWOvNxBOfS"}],"key":"weik5SnRTo"},{"type":"text","value":" concerns itself with learning the ","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"eGiFpgXXGl"},{"type":"emphasis","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"structure","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"Zju85Vg9EC"}],"key":"T3SHZxWkru"},{"type":"text","value":" of data without the use of outside feedback or labels.\nIn RL, though, the agent receives a ","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"CtqYZP8SLR"},{"type":"strong","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"reward signal","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"x115LsRgeX"}],"key":"lVKcyQ7kn8"},{"type":"text","value":" from the environment,\nwhich can be thought of as a sort of feedback.","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"Ltej8dFmEe"}],"key":"IptPO7ITK1"},{"type":"paragraph","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"Unsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other purposes.","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"RXEFFmBoxh"}],"key":"lxYiDk0T2m"}],"key":"jq7TCtQvTR"}],"key":"SuuYLK4tDT"}],"key":"b4RXuJUy9o"},{"type":"block","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"Core tasks of reinforcement learning","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"I04V7R4BHL"}],"identifier":"core-tasks-of-reinforcement-learning","label":"Core tasks of reinforcement learning","html_id":"core-tasks-of-reinforcement-learning","implicit":true,"enumerator":"3","key":"jXfzL3udiX"},{"type":"paragraph","position":{"start":{"line":73,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"What tasks, exactly, does RL comprise?\nAn RL algorithm must typically solve two main subtasks:","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"K95ejyxXC4"}],"key":"dVgIztKDPs"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":76,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":76,"column":1},"end":{"line":79,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":76,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"strong","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"Policy evaluation 
(prediction):","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"EVm21DWUUH"}],"key":"ehj7E7GFfw"},{"type":"text","value":"\nHow ‘good’ is a specific state, or state-action pair (under a given policy)?\nThat is, how much reward does it lead to in the long run?","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"YXuOAKBxd4"}],"key":"BVGPPaZ2tc"}],"key":"VHZO7P3xDm"},{"type":"listItem","spread":true,"position":{"start":{"line":80,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"strong","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Policy optimization (control):","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"JX1bJQMk7h"}],"key":"K5ePsW2jy3"},{"type":"text","value":"\nSuppose we fully understand how the environment behaves.\nWhat is the best action to take in every scenario?","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"RgD4lhn7gp"}],"key":"kUK9DjtJQ0"}],"key":"R2QkJZi9xM"}],"key":"xyYczIfvhf"},{"type":"comment","value":" **Recursion (bootstrapping):** How can we \"reuse\" our current predictions to generate new information? ","key":"b8xI6yqDly"},{"type":"comment","value":" **Exploration-exploitation tradeoff:** Should we try new actions, or capitalize on actions that we currently believe to be good? ","key":"tzXYGfdsC7"}],"key":"Lmm8OVBExh"},{"type":"block","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"Course overview","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"JvDRwEWpnV"}],"identifier":"course-overview","label":"Course overview","html_id":"course-overview","implicit":true,"enumerator":"4","key":"SdVJNiMCX5"},{"type":"paragraph","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The course will progress through the following units:","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"wpfOiWzHe7"}],"key":"S2b2qZCtjf"},{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"link","url":"/mdps","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"ZZKQ4ww8Xk"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"jNJq2HMUvW"},{"type":"text","value":" introduces ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"JXXRa2Fmsn"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"Markov Decision Processes,","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"AQ2a7anIqM"}],"key":"Slu9LSPG2f"},{"type":"text","value":"\nthe core mathematical framework for describing a large class of interactive 
environments.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"PsEmXEBnrY"}],"key":"ESu0uaaofe"},{"type":"paragraph","position":{"start":{"line":97,"column":1},"end":{"line":100,"column":1}},"children":[{"type":"link","url":"/control","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"2 Linear Quadratic Regulators","key":"XK6CrBAHpd"}],"urlSource":"./control.md","dataUrl":"/control.json","internal":true,"protocol":"file","key":"oC5MfYUKZF"},{"type":"text","value":" is a standalone chapter on the ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"MjY3Tt2LLi"},{"type":"strong","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"L81KpnXJAv"}],"key":"f1laiKxUMn"},{"type":"text","value":" (LQR),\nan important tool for ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"n01Xg2EEqV"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"continuous control","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"np9S50JPnI"}],"key":"bcPEBiXUzN"},{"type":"text","value":",\nin which the state and action spaces are no longer ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"lkvEfE5Bnu"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"UxM5vOr9zy"}],"key":"EsIHOgcYLL"},{"type":"text","value":" but rather ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"YJdFfAzdht"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"aIIZdzwCzJ"}],"key":"gsnw1V2dcX"},{"type":"text","value":".\nThis has widespread applications in robotics.","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"tb07o2qKqK"}],"key":"RJ06e79ndZ"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":105,"column":1}},"children":[{"type":"link","url":"/bandits","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"3 Multi-Armed Bandits","key":"FdP9RIxewD"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"d3LvWz7Xly"},{"type":"text","value":" introduces the ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"mKo4uGD0vM"},{"type":"strong","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"multi-armed bandit","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"MY58lE8Nu7"}],"key":"zNBISgb6VC"},{"type":"text","value":" (MAB) model for 
","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"WwCEaRVIrk"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"stateless","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"TXJ1fiSfqv"}],"key":"h1AF6WlDXR"},{"type":"text","value":" sequential decision-making tasks.\nIn exploring a number of algorithms,\nwe will see how each of them strikes a different balance between ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"Fw7b0hwFIJ"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploring","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"EcqkUj1o7c"}],"key":"Ahl2mu9nNl"},{"type":"text","value":" new options and ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"Lpw5zMBxTs"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploiting","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"VhFgXOuX6G"}],"key":"y7EaCrIZIm"},{"type":"text","value":" known options.\nThis ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"Zf01RB7slZ"},{"type":"strong","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploration-exploitation tradeoff","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"unHScsSdeO"}],"key":"CNBON1UxkJ"},{"type":"text","value":" is a core consideration in RL algorithm design.","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"fBBGmSSKqU"}],"key":"cELNd8yy3F"},{"type":"paragraph","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"link","url":"/supervised-learning","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"4 Supervised learning","key":"kzT8yOpeSe"}],"urlSource":"./supervised_learning.md","dataUrl":"/supervised-learning.json","internal":true,"protocol":"file","key":"Wlr0DMJCBt"},{"type":"text","value":" is a standalone crash course on some tools from supervised learning that we will use in later chapters.","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"Z0hsTIWDxB"}],"key":"mURjqKue2Y"},{"type":"paragraph","position":{"start":{"line":109,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"link","url":"/fitted-dp","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"5 Fitted Dynamic Programming Algorithms","key":"IRpMO7aeX4"}],"urlSource":"./fitted_dp.md","dataUrl":"/fitted-dp.json","internal":true,"protocol":"file","key":"Ul9IxV7Anl"},{"type":"text","value":" introduces ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"YdQHNLuAWA"},{"type":"strong","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"fitted dynamic programming","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"okhAoQXUgR"}],"key":"ydGEh20QfT"},{"type":"text","value":" (fitted DP) algorithms for solving MDPs.\nThese algorithms use supervised learning to approximately evaluate policies when they cannot be 
evaluated exactly.","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"tE9mtDK2X2"}],"key":"KYhB2gENk0"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"link","url":"/pg","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"6 Policy Optimization","key":"stSoRsK0fH"}],"urlSource":"./pg.md","dataUrl":"/pg.json","internal":true,"protocol":"file","key":"erZ9NR5D3Z"},{"type":"text","value":" explores an important class of algorithms based on iteratively improving a policy.\nWe will also encounter the use of ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"zQIBBTRale"},{"type":"emphasis","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"deep neural networks","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"GbRRMlL5Xy"}],"key":"wPB1KuQ72D"},{"type":"text","value":" to express more complicated policies and approximate complicated functions.","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"BLFWZ6EB25"}],"key":"FagooFntfc"},{"type":"paragraph","position":{"start":{"line":115,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"link","url":"/imitation-learning","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"7 Imitation Learning","key":"htp1AnKSpt"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"q615T7thdp"},{"type":"text","value":" attempts to learn a good policy from expert demonstrations.\nAt its most basic, this is an application of supervised learning to RL tasks.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"CGlN3kZcPY"}],"key":"q5WJ7zaI8j"},{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"link","url":"/planning","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"8 Planning","key":"uQ7HYrYDT0"}],"urlSource":"./planning.md","dataUrl":"/planning.json","internal":true,"protocol":"file","key":"RFUV5187XE"},{"type":"text","value":" looks at ways to ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"Xuf13Msy4A"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"ezv2qhtd1W"}],"key":"oguLuSa24k"},{"type":"text","value":" plan ahead when the environment’s dynamics are known.\nWe will study the ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"gabC4JNyvF"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"V0k7CpLbpj"}],"key":"bOXWSiS9eQ"},{"type":"text","value":" heuristic,\nwhich has been used to great success in the famous AlphaGo algorithm and its 
successors.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"vcYUXqMLvS"}],"key":"SPzDq3wzKv"},{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"link","url":"/exploration","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"9 Exploration in MDPs","key":"dnyJxtop9L"}],"urlSource":"./exploration.md","dataUrl":"/exploration.json","internal":true,"protocol":"file","key":"RC5Dvna1HA"},{"type":"text","value":" continues to investigate the exploration-exploitation tradeoff.\nWe will extend ideas from multi-armed bandits to the MDP setting.","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"donZTXUj4D"}],"key":"O37mtJPGng"},{"type":"paragraph","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"link","url":"/background","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"text","value":"Appendix: Background","key":"aapnwXx6W2"}],"urlSource":"./background.md","dataUrl":"/background.json","internal":true,"protocol":"file","key":"KnrabjlMKm"},{"type":"text","value":" contains an overview of selected background mathematical content and programming content.","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"key":"xY12ES1IbL"}],"key":"s0UfjrlNhg"},{"type":"comment","value":" \n| Chapter | States | Actions | Rewards (or costs) |\n|:-------:|:------:|:-------:|:-------:|\n| [](#bandits) | N/A | Finite | Stochastic |\n| [](#mdps) | Finite | Finite | Deterministic |\n| [](#fitted_dp) | Large or continuous | Finite | Deterministic |\n| [](#lqr) | Continuous | Continuous | Deterministic |\n","key":"kZL8jQsnx7"}],"key":"GcDw6V6DMg"},{"type":"block","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"Notation","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"CFHFCwLbMq"}],"identifier":"notation","label":"Notation","html_id":"notation","implicit":true,"enumerator":"5","key":"MnwFppallB"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":142,"column":1}},"children":[{"type":"text","value":"We will use the following notation throughout the book.\nThis notation is inspired by ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"bSQE7KNOJ9"},{"type":"cite","kind":"narrative","label":"sutton_reinforcement_2018","identifier":"sutton_reinforcement_2018","children":[{"type":"text","value":"Sutton & Barto (2018)","key":"iXBnLnfei5"}],"enumerator":"1","key":"VUh9ejFzYI"},{"type":"text","value":" and ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"GYOqHXYwCI"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"BFxxf2ml1d"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"aEqvRC6gDP"}],"key":"bmH4alt59C"},{"type":"text","value":" (2022)","key":"UVIJGKoISr"}],"enumerator":"2","key":"gD4UegoEQR"},{"type":"text","value":".\nWe use 
","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"NI7P1izmZZ"},{"type":"inlineMath","value":"[N]","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"[N][N][N]","key":"Aa6bs3aowQ"},{"type":"text","value":" as shorthand for the set ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"VtTEFhr28J"},{"type":"inlineMath","value":"\\{ 0, 1, \\dots, N-1 \\}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"{0,1,,N1}\\{ 0, 1, \\dots, N-1 \\}{0,1,,N1}","key":"pzdGySvRJW"},{"type":"text","value":".","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"zcXlsOxz5a"}],"key":"gl3OwCO4Q5"},{"type":"table","position":{"start":{"line":144,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Element","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"ysAFWxMKtt"}],"key":"lBcjclQQHc"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Space","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"DAcv0EVF6X"}],"key":"DZnOu5ZeFY"},{"type":"tableCell","header":true,"align":"left","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Definition (of element)","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"NfVataC9B4"}],"key":"oy2Vl1M4OL"}],"key":"wI7kRhuxAx"},{"type":"tableRow","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"sss","key":"kgYtajNjSg"}],"key":"OwwXSs2Mba"},{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"S\\mathcal{S}S","key":"GbPukUUSQc"}],"key":"mDhf5MNjXq"},{"type":"tableCell","align":"left","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"text","value":"A 
state.","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"key":"JSqxJwprXI"}],"key":"NmALx0KAUK"}],"key":"Yzl0l4eY8T"},{"type":"tableRow","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"aaa","key":"pZXlqm7QK5"}],"key":"TI5wa6yqYW"},{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"A\\mathcal{A}A","key":"Q4eBFqXOfy"}],"key":"T6KLeILfv8"},{"type":"tableCell","align":"left","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"An action.","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"OlZOrLPiuC"}],"key":"NuBl5IOI7Q"}],"key":"LGfpiBvg5b"},{"type":"tableRow","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"html":"rrr","key":"Cp6Kmc9ZlR"}],"key":"MyBb4SedQo"},{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[],"key":"t1oDVAaqqL"},{"type":"tableCell","align":"left","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"A reward.","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"hw6Cnj7B38"}],"key":"oQN3yP8YVC"}],"key":"eOVW34RjUq"},{"type":"tableRow","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"γ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"C6GfCsXu5u"}],"key":"BXlIllYim3"},{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[],"key":"VnN49Q7SM9"},{"type":"tableCell","align":"left","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"A discount 
factor.","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"FRXifNC4PU"}],"key":"wlTk1RyH2B"}],"key":"P19dqP9XHX"},{"type":"tableRow","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"τ","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"mp9XbkbwBs"}],"key":"RJ3at05ooy"},{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{T}","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"html":"T\\mathcal{T}T","key":"bAs2i5XB2E"}],"key":"OACckM3wcm"},{"type":"tableCell","align":"left","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"A trajectory.","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"yyPFnTftxN"}],"key":"fDe1hYzNT5"}],"key":"ZnTlcMUMae"},{"type":"tableRow","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"iWr3zCsn8e"}],"key":"oaoqZkMPjw"},{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"kwqaudmeQf"}],"key":"vcsx9vFdsQ"},{"type":"tableCell","align":"left","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"A policy.","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"pua28TYSn1"}],"key":"QqffgNh9X4"}],"key":"Dx9F0Ytcvm"},{"type":"tableRow","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"VπV^\\piVπ","key":"THnJSzRVdp"}],"key":"UbPvJQQCo9"},{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"SR\\mathcal{S} \\to \\mathbb{R}SR","key":"EtRcuxXkjZ"}],"key":"zdnX4TxUpq"},{"type":"tableCell","align":"left","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"text","value":"The value function of policy 
","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"uC4W0oBtff"},{"type":"text","value":"π","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"ZnJEIojln0"},{"type":"text","value":".","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"dajxqjbSDW"}],"key":"Mm5LjULj5p"}],"key":"fkj2kcefV6"},{"type":"tableRow","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"QπQ^\\piQπ","key":"lZwXfzikO7"}],"key":"ITbazvUlhE"},{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"YVoFrrEeub"}],"key":"G1GEzcZnw2"},{"type":"tableCell","align":"left","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"The action-value function (a.k.a. Q-function) of policy ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"FuGeXfx9uT"},{"type":"text","value":"π","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"Baob3b1YsW"},{"type":"text","value":".","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"Jyu8OAO8F8"}],"key":"oeYepUykmA"}],"key":"FhtPLsbrz4"},{"type":"tableRow","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"A^\\pi","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"AπA^\\piAπ","key":"OReOouCmJ0"}],"key":"eWuS1eezUh"},{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"a82dZv0AHk"}],"key":"RiiYG8g1oZ"},{"type":"tableCell","align":"left","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"The advantage function of policy 
","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"hcggaUMdnK"},{"type":"text","value":"π","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"cO5J2AWVcJ"},{"type":"text","value":".","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"fXo1urSe9J"}],"key":"zwNxj2muZB"}],"key":"M3sO872FFs"},{"type":"tableRow","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[],"key":"DKC0vgDKWo"},{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"inlineMath","value":"\\triangle(\\mathcal{X})","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"(X)\\triangle(\\mathcal{X})(X)","key":"kaI6n0BCiH"}],"key":"NV1LHBh0sc"},{"type":"tableCell","align":"left","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"A distribution supported on ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"a8AFsDfp0Q"},{"type":"inlineMath","value":"\\mathcal{X}","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"X\\mathcal{X}X","key":"HBdmceG0jX"},{"type":"text","value":".","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"UxoonH6gf4"}],"key":"HVdyOOIUHc"}],"key":"wcLXtpaGif"},{"type":"tableRow","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"h\\hih","key":"KXMGYUCm7j"}],"key":"iVgz6ayU5f"},{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"[\\hor]","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"[H][\\hor][H]","key":"M6mBBjd6mI"}],"key":"DeabTgUiZe"},{"type":"tableCell","align":"left","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"Time horizon index of an MDP (subscript).","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"key":"lABMb1yGNG"}],"key":"I3PRODUtXb"}],"key":"YSKrP21KDA"},{"type":"tableRow","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"k","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"kkk","key":"AjQwEkkhIU"}],"key":"dAdqYzTEhD"},{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"[K]","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"[K][K][K]","key":"D4d5QG5PlE"}],"key":"kCkGuCjljL"},{"type":"tableCell","align":"left","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"text","value":"Arm index of a multi-armed bandit 
(superscript).","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"gPFH01e0Kr"}],"key":"KwaKWVutl5"}],"key":"x9NoZ0URtW"},{"type":"tableRow","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"t","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"ttt","key":"ldHNzV3LNi"}],"key":"KhFtguEguI"},{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"[T]","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"[T][T][T]","key":"PNKFgoPcLa"}],"key":"Hq7MpTK1lw"},{"type":"tableCell","align":"left","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"text","value":"Iteration index of an algorithm (subscript).","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"uM8PE67CUc"}],"key":"bKVnXVYP9t"}],"key":"mkIT6qszGm"},{"type":"tableRow","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"K6YCKlXoaJ"}],"key":"mZoGJ382O2"},{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"Θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"Ze99Sk11C9"}],"key":"bwD3QyFMjO"},{"type":"tableCell","align":"left","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"A set of parameters.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"WJ0QfVDWgl"}],"key":"hzUQts58w9"}],"key":"dclXqhtxQn"}],"key":"iA27IoWeLe"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"Note that throughout the text, certain symbols will stand for either random variables or fixed values.\nWe aim to clarify in ambiguous settings.\nBe warned that","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"Rf9wjezA2z"}],"key":"Z1YEhBdvNj"}],"key":"t4wD5ZGZsl"},{"type":"block","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"children":[{"type":"text","value":"Programming","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"na8JIRujDD"}],"label":"programming","identifier":"programming","html_id":"programming","enumerator":"6","key":"jFgNka7Ub8"},{"type":"paragraph","position":{"start":{"line":170,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"Why include code in a textbook?\nWe believe that implementing an algorithm is a strong test of your understanding of it;\nmathematical notation can often abstract away details,\nwhile a computer must be given every single instruction.\nWe have sought to write readable Python code that is self-contained within each file.\nThis approach is inspired by 
","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"EtMi1AY5j4"},{"type":"cite","kind":"narrative","label":"sussman_functional_2013","identifier":"sussman_functional_2013","children":[{"type":"text","value":"Sussman ","key":"Aaxiob91Pa"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"ddqSS1h3uW"}],"key":"qW5KnAl5Qb"},{"type":"text","value":" (2013)","key":"PvavgpXSVa"}],"enumerator":"3","key":"fFLsZDQG65"},{"type":"text","value":".\nThere are some ways in which the code style differs from typical software projects:","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"mVhlTRURcY"}],"key":"mdbQ28juKe"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":178,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":178,"column":1},"end":{"line":179,"column":1}},"children":[{"type":"text","value":"We keep use of language features to a minimum,\neven if it leads to code that could otherwise be more concisely or idiomatically expressed.","position":{"start":{"line":178,"column":1},"end":{"line":178,"column":1}},"key":"YP9bsvBqgx"}],"key":"HpCGA6fXqM"},{"type":"listItem","spread":true,"position":{"start":{"line":180,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"text","value":"The variable names used in the code match those used in the main text.\nFor example, the variable ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"SDfYhBfiGl"},{"type":"inlineCode","value":"s","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"uyAB7fHe4j"},{"type":"text","value":" will be used instead of the more explicit ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"C8PAQavgnU"},{"type":"inlineCode","value":"state","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"f20BuODFER"},{"type":"text","value":".","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"yP2W2SE89F"}],"key":"YL8uzV2QbR"}],"key":"GZ7vjnTbR1"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"We also make extensive use of Python ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"FFFNgI2omG"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"type annotations","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"LvEMvMuMmG"}],"key":"nBz3vFeOxL"},{"type":"text","value":" to explicitly specify variable types, including shapes of vectors and matrices using the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"b14kKMbWIL"},{"type":"link","url":"https://github.com/patrick-kidger/jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"iG564GyZF7"}],"urlSource":"https://github.com/patrick-kidger/jaxtyping","error":true,"key":"BggJEJfw2o"},{"type":"text","value":" 
library.","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"TXaaX8ClMe"}],"key":"BX3GdTUuYa"},{"type":"paragraph","position":{"start":{"line":185,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"This is an interactive book built with ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"ebbtdafW1b"},{"type":"link","url":"https://jupyterbook.org/en/stable/intro.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Jupyter Book","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"KWaSepac5s"}],"urlSource":"https://jupyterbook.org/en/stable/intro.html","key":"ENKHTbCD7O"},{"type":"text","value":".\nIt uses ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"SKoRnAUn4z"},{"type":"link","url":"https://docs.python.org/3.11/contents.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Python 3.11","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"uXzD7APjBB"}],"urlSource":"https://docs.python.org/3.11/contents.html","key":"Ske2DhIGre"},{"type":"text","value":".\nIt uses the ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"OEFize3fsd"},{"type":"link","url":"https://jax.readthedocs.io/en/latest/index.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"JAX","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"YbcDd5mVMW"}],"urlSource":"https://jax.readthedocs.io/en/latest/index.html","key":"e5ORPJvuXl"},{"type":"text","value":" library for numerical computing.\nJAX was chosen for the clarity of its functional style and due to its mature RL ecosystem,\nsustained in large part by the Google DeepMind research group and a large body of open-source contributors.\nWe use the standard ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"BWSnHSXCh6"},{"type":"link","url":"https://gymnasium.farama.org/","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Gymnasium","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"gpIXpzRvjZ"}],"urlSource":"https://gymnasium.farama.org/","key":"XJxyjCSDgK"},{"type":"text","value":" library for interfacing with RL environments.","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"dkRGaoORT9"}],"key":"QePGbZllQc"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"The following names are exported from the ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"ZFcf2tHZ7a"},{"type":"inlineCode","value":"utils","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"QNbXzpUEK4"},{"type":"text","value":" module:","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"TlCpzrQqWF"}],"key":"lNaxJf6iP4"},{"type":"code","lang":"python","value":"import matplotlib.pyplot as plt\n\n# convenient class builder\nfrom typing import NamedTuple\n\n# function typings\nfrom collections.abc import Callable\n\n# array typings\nfrom jaxtyping import Float, Array\n\n# convenient function composition\nfrom functools import partial\n\n# numerical computing and 
linear algebra\nimport jax\nimport jax.numpy as jnp\n\n# print functions as latex\nimport latexify\n\nplt.style.use(\"fivethirtyeight\")","position":{"start":{"line":194,"column":1},"end":{"line":217,"column":1}},"key":"dutMvl9DOK"}],"key":"DjpxQccfbH"}],"key":"nnnUESdZ26"},"references":{"cite":{"order":["sutton_reinforcement_2018","agarwal_reinforcement_2022","sussman_functional_2013"],"data":{"sutton_reinforcement_2018":{"label":"sutton_reinforcement_2018","enumerator":"1","html":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction (Second edition). The MIT Press."},"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"2","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."},"sussman_functional_2013":{"label":"sussman_functional_2013","enumerator":"3","html":"Sussman, G. J., Wisdom, J., & Farr, W. (2013). Functional Differential Geometry. The MIT Press."}}}},"footer":{"navigation":{"next":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"2cdeee9bc604ea0150aa2ba9d0d7b73c09784f007761496df1c2715f83d28614","slug":"index","location":"/index.md","dependencies":[],"frontmatter":{"title":"Introduction","kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","numbering":{"all":{"enabled":true}},"math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"index.md","url":"/build/index-b84d1d5a6390c0b2f1723ee4aeac02d1.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":16,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Welcome to the study of reinforcement learning!\nThis textbook accompanies the undergraduate course ","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"eUY2p9j14c"},{"type":"link","url":"http://lucasjanson.fas.harvard.edu/courses/CS_Stat_184_0.html","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"children":[{"type":"text","value":"CS 1840/STAT 184","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"LlDJakhJl9"}],"urlSource":"http://lucasjanson.fas.harvard.edu/courses/CS_Stat_184_0.html","key":"hjjMHDQ8vD"},{"type":"text","value":" taught at Harvard.\nIt is intended to be a friendly yet rigorous introduction to this active subfield of machine 
learning.","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"zUZniUjE01"}],"key":"c90F2YxMsI"}],"key":"SHf3lE39fc"},{"type":"block","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"Prerequisites","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"H5Suu9L3s3"}],"identifier":"prerequisites","label":"Prerequisites","html_id":"prerequisites","implicit":true,"enumerator":"1","key":"RkiQSohnJZ"},{"type":"paragraph","position":{"start":{"line":24,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability.\nFor Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents.\nStat 111 is strongly recommended but not required.\nSpecifically, we will assume that you know the following topics. The ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"jzmIr9ceh1"},{"type":"emphasis","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"italicized terms","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"dC2u25IzWZ"}],"key":"shiVYqF0OK"},{"type":"text","value":" have brief re-introductions in the text or in the ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"DsTxtreLNn"},{"type":"link","url":"/background","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"Appendix: Background","key":"RRfh4emlqH"}],"urlSource":"./background.md","dataUrl":"/background.json","internal":true,"protocol":"file","key":"itvIchbW3K"},{"type":"text","value":":","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"YNEK2qJi1a"}],"key":"pfHqrqFrCI"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":29,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":29,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"strong","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Linear Algebra:","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"FmqFc9cWqR"}],"key":"NGrt9Kx56U"},{"type":"text","value":" Vectors and matrices, matrix multiplication, matrix\ninversion, eigenvalues and eigenvectors.","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"X4jHXVUVhi"}],"key":"NJf9bvHqpg"},{"type":"listItem","spread":true,"position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"strong","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"Multivariable Calculus:","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"uQO5kgVikT"}],"key":"byNfZbMy6r"},{"type":"text","value":" Partial derivatives, the chain rule, Taylor series, ","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"vnpcLOwnzd"},{"type":"emphasis","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"gradients, directional derivatives, 
Lagrange multipliers.","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"PgEVVDejsR"}],"key":"lDweUDDmGr"}],"key":"WAABdEVFgY"},{"type":"listItem","spread":true,"position":{"start":{"line":32,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"strong","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"children":[{"type":"text","value":"Probability:","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"key":"xcYQqKPcSJ"}],"key":"PKqCCW9NuC"},{"type":"text","value":" Random variables, probability distributions,\nexpectation and variance, the law of iterated expectations (Adam’s rule), covariance, conditional probability, Bayes’s rule, and the law of total probability.","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"key":"LIYWnRe6pA"}],"key":"Yta3GpJpdT"}],"key":"Ji5JM43eS0"},{"type":"paragraph","position":{"start":{"line":35,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"You should also be comfortable with programming in Python.\nSee ","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"H7td5ppQYk"},{"type":"crossReference","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Section ","key":"MznmpEmnTA"},{"type":"text","value":"6","key":"bS4wQRrwSE"}],"identifier":"programming","label":"programming","kind":"heading","template":"Section %s","enumerator":"6","resolved":true,"html_id":"programming","key":"ZoaSIyGY7t"},{"type":"text","value":" for more about this textbook’s philosophy regarding programming.","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"G6CzETnpyL"}],"key":"uD0qrn7Vyw"}],"key":"yK3KASuhxj"},{"type":"block","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"Reinforcement learning in a nutshell","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"PSuaTwgXuN"}],"identifier":"reinforcement-learning-in-a-nutshell","label":"Reinforcement learning in a nutshell","html_id":"reinforcement-learning-in-a-nutshell","implicit":true,"enumerator":"2","key":"sUns36yIDP"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"Broadly speaking,\nRL studies ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"zKQFjqPVUy"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"sequential decision-making","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"fGUbp71vXZ"}],"key":"PbrBzPa7bu"},{"type":"text","value":" in ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"ZHsEmDYYbd"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"dynamic environments.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"yHHqZHDrGR"}],"key":"Tf0LpmmbhG"},{"type":"text","value":"\nAn RL algorithm finds a strategy, called a 
","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"dUGZylQnFa"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"policy,","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"hgD1QWMstb"}],"key":"cwOTsoJ18U"},{"type":"text","value":" that maximizes the ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"qGa41FbGO0"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"reward","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"YkoKYEofow"}],"key":"FUxRNjbUN5"},{"type":"text","value":" it obtains from the environment.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"hxO5yXXpk0"}],"key":"hncwKUHLSM"},{"type":"paragraph","position":{"start":{"line":46,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"RL provides a powerful framework for attacking a wide variety of problems,\nincluding robotic control, video games and board games, resource management, language modelling, and more.\nIt also provides an interdisciplinary paradigm for studying animal and human behavior.\nMany of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms.","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"XgBbc1Apq5"}],"key":"zHBVFKAR1e"}],"key":"iawtpLo18y"},{"type":"block","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":53,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"How does RL compare to the other two core machine learning paradigms,\n","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"Lyynvd8iCQ"},{"type":"strong","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"hDnDPyLIc8"}],"key":"Vj6pIIEJT9"},{"type":"text","value":" and ","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"JkmjByLTP9"},{"type":"strong","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"unsupervised learning?","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"d2H1GNoIRM"}],"key":"jEpoKumYjQ"}],"key":"y6UGtKtMha"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":56,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":56,"column":1},"end":{"line":62,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":56,"column":1},"end":{"line":59,"column":1}},"children":[{"type":"strong","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"Supervised learning","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"yyggOtJgd8"}],"key":"dtAlFWSz44"},{"type":"text","value":" (SL) concerns itself with learning a mapping from inputs to outputs.\nTypically the data takes the form of 
","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"PAv77VBAgx"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"statistically independent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"OFNjtaOZp8"}],"key":"csofffmavF"},{"type":"text","value":" input-output pairs.\nIn RL, however, the data is generated by the agent interacting with the environment,\nmeaning the sequential observations of the state are ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"C9PesWBCfG"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"not independent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"VaTgCBpbjY"}],"key":"cqjAb4aPeX"},{"type":"text","value":" from each other.","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"CDIE0Yk1pN"}],"key":"gJNEpnwHxe"},{"type":"paragraph","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"Conversely, SL is a well-studied field that provides many useful tools for RL.","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"IdCbMZmL1Y"}],"key":"winFA3SNnE"}],"key":"tRm5HhDZo5"},{"type":"listItem","spread":true,"position":{"start":{"line":63,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":63,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"strong","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"Unsupervised learning","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"GBD11HvJ6u"}],"key":"fBBy5Tc8l7"},{"type":"text","value":" concerns itself with learning the ","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"sDGxWSXxJF"},{"type":"emphasis","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"structure","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"zQu2a00rvy"}],"key":"R7YipiOry5"},{"type":"text","value":" of data without the use of outside feedback or labels.\nIn RL, though, the agent receives a ","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"xhAUaveobb"},{"type":"strong","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"reward signal","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"jpccG53sUI"}],"key":"pl9uB4B1ej"},{"type":"text","value":" from the environment,\nwhich can be thought of as a sort of feedback.","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"gxZAbhegWB"}],"key":"PYNEj8hBtN"},{"type":"paragraph","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"Unsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other 
purposes.","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"eBfTV3K355"}],"key":"HF3cMuN7Ja"}],"key":"M1XMrElxU9"}],"key":"t8wtdtJ16T"}],"key":"wxK1jN9FSg"},{"type":"block","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"Core tasks of reinforcement learning","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"RTBTaEkA9t"}],"identifier":"core-tasks-of-reinforcement-learning","label":"Core tasks of reinforcement learning","html_id":"core-tasks-of-reinforcement-learning","implicit":true,"enumerator":"3","key":"mHXIHjofet"},{"type":"paragraph","position":{"start":{"line":73,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"What tasks, exactly, does RL comprise?\nAn RL algorithm must typically solve two main subtasks:","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"t7Vgetjqeq"}],"key":"Y5p0sLkaoE"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":76,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":76,"column":1},"end":{"line":79,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":76,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"strong","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"Policy evaluation (prediction):","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"m5rwjOr7Uv"}],"key":"CrO7089x9Q"},{"type":"text","value":"\nHow ‘good’ is a specific state, or state-action pair (under a given policy)?\nThat is, how much reward does it lead to in the long run?","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"fZUm0FsXOf"}],"key":"jldiRWVORE"}],"key":"nIwv8UxAA9"},{"type":"listItem","spread":true,"position":{"start":{"line":80,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"strong","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Policy optimization (control):","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"aXG73C5WB1"}],"key":"cykdUU91b7"},{"type":"text","value":"\nSuppose we fully understand how the environment behaves.\nWhat is the best action to take in every scenario?","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"eQJx5UPqif"}],"key":"hFn8QAAAxd"}],"key":"E9BIeOkyMb"}],"key":"UQpwoteTgH"},{"type":"comment","value":" **Recursion (bootstrapping):** How can we \"reuse\" our current predictions to generate new information? ","key":"y6LWiWj9jk"},{"type":"comment","value":" **Exploration-exploitation tradeoff:** Should we try new actions, or capitalize on actions that we currently believe to be good? 
","key":"QYbYpVUu8b"}],"key":"Nmn2Nm0C2x"},{"type":"block","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"Course overview","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"nNgMzrMNmr"}],"identifier":"course-overview","label":"Course overview","html_id":"course-overview","implicit":true,"enumerator":"4","key":"RIGkCbEu1C"},{"type":"paragraph","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The course will progress through the following units:","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"Xe7oc6Zz9g"}],"key":"larmpUUJmD"},{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"link","url":"/mdps","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"R61EoZXa5O"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"TCrbDf0vUY"},{"type":"text","value":" introduces ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"J2te4N9G3w"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"Markov Decision Processes,","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"RsDN87bvrj"}],"key":"S3DutmGupz"},{"type":"text","value":"\nthe core mathematical framework for describing a large class of interactive environments.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"hturBwU3mu"}],"key":"AS24pu7re4"},{"type":"paragraph","position":{"start":{"line":97,"column":1},"end":{"line":100,"column":1}},"children":[{"type":"link","url":"/control","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"2 Linear Quadratic Regulators","key":"TSAaKjGGJt"}],"urlSource":"./control.md","dataUrl":"/control.json","internal":true,"protocol":"file","key":"MCXLkP25Xl"},{"type":"text","value":" is a standalone chapter on the ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"HRXf1ndKCS"},{"type":"strong","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"FHNRFQ9eXc"}],"key":"VKpjbNrhom"},{"type":"text","value":" (LQR),\nan important tool for ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"siFFOnoyk7"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"continuous control","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"N7LkIsr6OR"}],"key":"LkR8yQS2EC"},{"type":"text","value":",\nin which the state and action spaces are no longer ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"G1gjWJZRBj"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"UIcdLeOfJw"}],"key":"vnbTJGsEbJ"},{"type":"text","value":" but rather 
","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"AOvDtRP4VY"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"DJeVDqKFOY"}],"key":"eyjxpvClAo"},{"type":"text","value":".\nThis has widespread applications in robotics.","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"dWK7zGAkYG"}],"key":"PegdZLnu5b"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":105,"column":1}},"children":[{"type":"link","url":"/bandits","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"3 Multi-Armed Bandits","key":"LzxLnPo4KZ"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"Kg3K8viBdw"},{"type":"text","value":" introduces the ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"dWRMeiodlm"},{"type":"strong","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"multi-armed bandit","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"peVUXxxKaX"}],"key":"aHOTDtfQia"},{"type":"text","value":" (MAB) model for ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"Su8WDIKZF4"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"stateless","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"EnJK0Bmj6T"}],"key":"v3I03gpl0F"},{"type":"text","value":" sequential decision-making tasks.\nIn exploring a number of algorithms,\nwe will see how each of them strikes a different balance between ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"k862r5i12Q"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploring","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"zWPLg3PPKb"}],"key":"oDRF1nyDyr"},{"type":"text","value":" new options and ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"T33CGDH90u"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploiting","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"L99XTx2Heh"}],"key":"k26XpK6KxG"},{"type":"text","value":" known options.\nThis ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"AZim3CBMEO"},{"type":"strong","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploration-exploitation tradeoff","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"F5y8u39TK0"}],"key":"eRbHWd8SZA"},{"type":"text","value":" is a core consideration in RL algorithm design.","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"z8kWo5tjEU"}],"key":"q1jLmxqSjv"},{"type":"paragraph","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"link","url":"/supervised-learning","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"4 Supervised 
learning","key":"ws9qdi4ICP"}],"urlSource":"./supervised_learning.md","dataUrl":"/supervised-learning.json","internal":true,"protocol":"file","key":"hKbOFup9Q7"},{"type":"text","value":" is a standalone crash course on some tools from supervised learning that we will use in later chapters.","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"s0AhfEiJ0J"}],"key":"H4qHnyScUb"},{"type":"paragraph","position":{"start":{"line":109,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"link","url":"/fitted-dp","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"5 Fitted Dynamic Programming Algorithms","key":"i3udRNwAgN"}],"urlSource":"./fitted_dp.md","dataUrl":"/fitted-dp.json","internal":true,"protocol":"file","key":"O97OQ7cJDw"},{"type":"text","value":" introduces ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"PMsCJ7Ft8s"},{"type":"strong","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"fitted dynamic programming","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"p3GOUO9qsl"}],"key":"WFvQ1LN7JD"},{"type":"text","value":" (fitted DP) algorithms for solving MDPs.\nThese algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly.","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"MUnR4OBPiz"}],"key":"cVPtIuRFLl"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"link","url":"/pg","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"6 Policy Gradient Methods","key":"WXyw8QnmTC"}],"urlSource":"./pg.md","dataUrl":"/pg.json","internal":true,"protocol":"file","key":"nHHzb337aE"},{"type":"text","value":" explores an important class of algorithms based on iteratively improving a policy.\nWe will also encounter the use of ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"XqdEL4MP2V"},{"type":"emphasis","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"deep neural networks","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"ro06ID7f2V"}],"key":"ubVdj1vUxu"},{"type":"text","value":" to express more complicated policies and approximate complicated functions.","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"d2Q8d2WRCZ"}],"key":"M5avcRxHTj"},{"type":"paragraph","position":{"start":{"line":115,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"link","url":"/imitation-learning","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"7 Imitation Learning","key":"IjM22K3N6X"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"ETI6AaKK0F"},{"type":"text","value":" attempts to learn a good policy from expert demonstrations.\nAt its most basic, this is an application of supervised learning to RL 
tasks.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"fi35WR7Ue7"}],"key":"EuGJlbIoNm"},{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"link","url":"/planning","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"8 Tree Search Methods","key":"UruAMBcmQv"}],"urlSource":"./planning.md","dataUrl":"/planning.json","internal":true,"protocol":"file","key":"GLnVYTQWi1"},{"type":"text","value":" looks at ways to ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"GaRvZ6pBHh"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"duEPEDB1FO"}],"key":"tMvr2Q1itJ"},{"type":"text","value":" plan ahead when the environment’s dynamics are known.\nWe will study the ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"AYhlZ8bfst"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"cHtwpoC1v6"}],"key":"sJsZVr8OLQ"},{"type":"text","value":" heuristic,\nwhich has been used to great success in the famous AlphaGo algorithm and its successors.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"FGKlExim9g"}],"key":"p5Jnk7V8kc"},{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"link","url":"/exploration","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"9 Exploration in MDPs","key":"f3swSA7HHK"}],"urlSource":"./exploration.md","dataUrl":"/exploration.json","internal":true,"protocol":"file","key":"pKefgwJr1r"},{"type":"text","value":" continues to investigate the exploration-exploitation tradeoff.\nWe will extend ideas from multi-armed bandits to the MDP setting.","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"k8LsVruKVD"}],"key":"hsUgUWu3Ll"},{"type":"paragraph","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"link","url":"/background","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"text","value":"Appendix: Background","key":"e4yXMUAxhs"}],"urlSource":"./background.md","dataUrl":"/background.json","internal":true,"protocol":"file","key":"zNvFA7XAMl"},{"type":"text","value":" contains an overview of selected background mathematical content and programming content.","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"key":"E7HW7UB0lD"}],"key":"pAftcxiHwr"},{"type":"comment","value":" \n| Chapter | States | Actions | Rewards (or costs) |\n|:-------:|:------:|:-------:|:-------:|\n| [](#bandits) | N/A | Finite | Stochastic |\n| [](#mdps) | Finite | Finite | Deterministic |\n| [](#fitted_dp) | Large or continuous | Finite | Deterministic |\n| [](#lqr) | Continuous | Continuous | Deterministic 
|\n","key":"TAOEjFMuxi"}],"key":"RaAK75MEZ2"},{"type":"block","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"Notation","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"QUBfD1B3Az"}],"identifier":"notation","label":"Notation","html_id":"notation","implicit":true,"enumerator":"5","key":"JPWst2Zq8R"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":142,"column":1}},"children":[{"type":"text","value":"We will use the following notation throughout the book.\nThis notation is inspired by ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"tfBcjdSZDt"},{"type":"cite","kind":"narrative","label":"sutton_reinforcement_2018","identifier":"sutton_reinforcement_2018","children":[{"type":"text","value":"Sutton & Barto (2018)","key":"eOyM7tf6ba"}],"enumerator":"1","key":"GC0ArmJ8cX"},{"type":"text","value":" and ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"inYgcWwNTL"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"MnTsMLL5nV"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"E5t0x6ldiY"}],"key":"UR7oqjSEtF"},{"type":"text","value":" (2022)","key":"lyluAQDYmm"}],"enumerator":"2","key":"e1NYAWDFzg"},{"type":"text","value":".\nWe use ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"RuZUjrN4vV"},{"type":"inlineMath","value":"[N]","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"[N][N][N]","key":"cH7n4TwqsV"},{"type":"text","value":" as shorthand for the set ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"izl0lDnIjd"},{"type":"inlineMath","value":"\\{ 0, 1, \\dots, N-1 \\}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"{0,1,,N1}\\{ 0, 1, \\dots, N-1 \\}{0,1,,N1}","key":"XWmwQUvbvI"},{"type":"text","value":".","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"k4Gf2QuKJA"}],"key":"BTsC8sj9sA"},{"type":"table","position":{"start":{"line":144,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Element","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"pdODWy2Hze"}],"key":"ECsXN1H3eD"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Space","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"PpEqn0dhkO"}],"key":"JLARZmR93y"},{"type":"tableCell","header":true,"align":"left","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Definition (of 
element)","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"pdEkeijIzB"}],"key":"F6iwuiNTzP"}],"key":"GaRS2fqhTA"},{"type":"tableRow","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"sss","key":"QEydzkP8qb"}],"key":"Dmq7QzVIkU"},{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"S\\mathcal{S}S","key":"TyeHcacX1Y"}],"key":"nePQC8VvfI"},{"type":"tableCell","align":"left","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"text","value":"A state.","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"key":"BTX6YgkpU6"}],"key":"NlwElIIk8a"}],"key":"el0MNOG4ko"},{"type":"tableRow","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"aaa","key":"w01awgxYMw"}],"key":"YLXwnL4nvL"},{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"A\\mathcal{A}A","key":"wbynRBXDFS"}],"key":"cpg0VYB2i5"},{"type":"tableCell","align":"left","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"An action.","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"dQIRmoM37s"}],"key":"I8WVgcdcGl"}],"key":"kRicxd2yXa"},{"type":"tableRow","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"html":"rrr","key":"CNVunJFk9j"}],"key":"NQPhHmMy6X"},{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[],"key":"TADRCd4Gtl"},{"type":"tableCell","align":"left","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"A 
reward.","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"JA0kFMC0ia"}],"key":"r1V3IRa83A"}],"key":"i3o7MDV8FS"},{"type":"tableRow","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"γ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"G9OU8IFAnh"}],"key":"sP1TeCvtZw"},{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[],"key":"enn892mXzJ"},{"type":"tableCell","align":"left","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"A discount factor.","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"ALtkbKC66c"}],"key":"qSwbi1rl0D"}],"key":"tQQzhthnqc"},{"type":"tableRow","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"τ","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"OLGqtOjRGv"}],"key":"tQ0ENYW48b"},{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{T}","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"html":"T\\mathcal{T}T","key":"Q2R4HP6zJz"}],"key":"eOMP3MH9NP"},{"type":"tableCell","align":"left","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"A trajectory.","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"ujjwKmnZ32"}],"key":"etE8EK3DbP"}],"key":"qfdockbjBj"},{"type":"tableRow","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"UOluUduBW9"}],"key":"LioZGGsFsW"},{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"TiPf7m3xAQ"}],"key":"N4ifV8yZXi"},{"type":"tableCell","align":"left","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"A policy.","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"gK3igbSoGp"}],"key":"iPdnlHuoNr"}],"key":"ZSYC7WyIQV"},{"type":"tableRow","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"VπV^\\piVπ","key":"CP3cvWSr4W"}],"key":"xrJ4fSArRH"},{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\to 
\\mathbb{R}","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"SR\\mathcal{S} \\to \\mathbb{R}SR","key":"eJ85SYTdTt"}],"key":"oTagaTUQMk"},{"type":"tableCell","align":"left","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"text","value":"The value function of policy ","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"RL4fVpFUtl"},{"type":"text","value":"π","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"hwzRdwTu7m"},{"type":"text","value":".","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"v34wqaDsD1"}],"key":"ZaNzjl7Xxg"}],"key":"m98bnMIFPP"},{"type":"tableRow","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"QπQ^\\piQπ","key":"LhYexj26Sc"}],"key":"TFNXZCv63v"},{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"qmRb1XMZTQ"}],"key":"dfintz76jq"},{"type":"tableCell","align":"left","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"The action-value function (a.k.a. Q-function) of policy ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"CNfeUIZhUd"},{"type":"text","value":"π","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"zMMPM9rNxR"},{"type":"text","value":".","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"WXT1Ck337n"}],"key":"fFHM0D8yOH"}],"key":"OkfwzXlzSF"},{"type":"tableRow","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"A^\\pi","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"AπA^\\piAπ","key":"IHriSRc4Bo"}],"key":"WhmTtAWXgm"},{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"lyEEVYxPmH"}],"key":"OdPzwgiPvM"},{"type":"tableCell","align":"left","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"The advantage function of policy 
","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"kydcbN84gI"},{"type":"text","value":"π","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"YTNfxp6RWz"},{"type":"text","value":".","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"x6dqMbzNtE"}],"key":"HSDV3bzyYs"}],"key":"DpgEdZXGXC"},{"type":"tableRow","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[],"key":"MSD4KAuthE"},{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"inlineMath","value":"\\triangle(\\mathcal{X})","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"(X)\\triangle(\\mathcal{X})(X)","key":"c4S3A7lLCW"}],"key":"aqihqGCFlY"},{"type":"tableCell","align":"left","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"A distribution supported on ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"nphoK45B4W"},{"type":"inlineMath","value":"\\mathcal{X}","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"X\\mathcal{X}X","key":"HMYDjgYA5Z"},{"type":"text","value":".","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"LUcbJu3x19"}],"key":"w2wzv3waGb"}],"key":"jVRq4eNE3E"},{"type":"tableRow","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"h\\hih","key":"rtoKnClvUt"}],"key":"RUnCHp7fkO"},{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"[\\hor]","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"[H][\\hor][H]","key":"MZgZWlUo8G"}],"key":"N3czgzItzj"},{"type":"tableCell","align":"left","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"Time horizon index of an MDP (subscript).","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"key":"E4yuowaWdL"}],"key":"rAbvnmcfb2"}],"key":"AuEIzr4uIo"},{"type":"tableRow","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"k","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"kkk","key":"KIA4igHtkH"}],"key":"WYpGa5PFR5"},{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"[K]","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"[K][K][K]","key":"ugL7F5WPyp"}],"key":"xrAzoz75r8"},{"type":"tableCell","align":"left","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"text","value":"Arm index of a multi-armed bandit 
(superscript).","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"Vwsu2uUW15"}],"key":"WOTYLXiqSh"}],"key":"SFZxWbKcp2"},{"type":"tableRow","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"t","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"ttt","key":"rNuHdNKc5Q"}],"key":"uGQEzlz4LQ"},{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"[T]","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"[T][T][T]","key":"d1TS7wczSf"}],"key":"B0WP09C4Ll"},{"type":"tableCell","align":"left","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"text","value":"Iteration index of an algorithm (subscript).","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"twhCZuZZBG"}],"key":"iuMPlBX2SH"}],"key":"hOThgLpiSE"},{"type":"tableRow","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"CiF0FTQbwo"}],"key":"UnRjP1iUYn"},{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"Θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"sc8UQWYUSa"}],"key":"MueJpYAa4s"},{"type":"tableCell","align":"left","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"A set of parameters.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"UYlO6Wg6sg"}],"key":"wfI2gat0d2"}],"key":"zyBZmjsaRF"}],"key":"PUlxxyxGEE"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"Note that throughout the text, certain symbols will stand for either random variables or fixed values.\nWe aim to clarify in ambiguous settings.\nBe warned that","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"azWQPDDSPk"}],"key":"mEJbFq29t9"}],"key":"xAC2mzqycs"},{"type":"block","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"children":[{"type":"text","value":"Programming","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"MFX4otGzx2"}],"label":"programming","identifier":"programming","html_id":"programming","enumerator":"6","key":"E2mtA8gAj5"},{"type":"paragraph","position":{"start":{"line":170,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"Why include code in a textbook?\nWe believe that implementing an algorithm is a strong test of your understanding of it;\nmathematical notation can often abstract away details,\nwhile a computer must be given every single instruction.\nWe have sought to write readable Python code that is self-contained within each file.\nThis approach is inspired by 
","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"UcTGTNFAmv"},{"type":"cite","kind":"narrative","label":"sussman_functional_2013","identifier":"sussman_functional_2013","children":[{"type":"text","value":"Sussman ","key":"ZyRxy8tJsB"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"HtxR0blNIv"}],"key":"gBNGGJrVUn"},{"type":"text","value":" (2013)","key":"BOXaPfprgS"}],"enumerator":"3","key":"Isw087zmVB"},{"type":"text","value":".\nThere are some ways in which the code style differs from typical software projects:","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"IK07cbsPOY"}],"key":"a0sX6u2r2G"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":178,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":178,"column":1},"end":{"line":179,"column":1}},"children":[{"type":"text","value":"We keep use of language features to a minimum,\neven if it leads to code that could otherwise be more concisely or idiomatically expressed.","position":{"start":{"line":178,"column":1},"end":{"line":178,"column":1}},"key":"Fv1o8BAzlA"}],"key":"bpoQmrNIyN"},{"type":"listItem","spread":true,"position":{"start":{"line":180,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"text","value":"The variable names used in the code match those used in the main text.\nFor example, the variable ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"jqWMRfkYj3"},{"type":"inlineCode","value":"s","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"Aw4MM0yRCM"},{"type":"text","value":" will be used instead of the more explicit ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"Hu3FdOmVJb"},{"type":"inlineCode","value":"state","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"K0j3S1KWeX"},{"type":"text","value":".","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"dkVLkAohKD"}],"key":"ykn8XL20xq"}],"key":"Tap98fMntg"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"We also make extensive use of Python ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"YNjUNAPc7E"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"type annotations","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"evYQIn1OX6"}],"key":"g68uf109FT"},{"type":"text","value":" to explicitly specify variable types, including shapes of vectors and matrices using the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"efCfK5ufsd"},{"type":"link","url":"https://github.com/patrick-kidger/jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"eXs5j7Jw6P"}],"urlSource":"https://github.com/patrick-kidger/jaxtyping","error":true,"key":"XvL3Gld8mX"},{"type":"text","value":" 
library.","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"ZRFdC5sXnf"}],"key":"SJyY3TtRrb"},{"type":"paragraph","position":{"start":{"line":185,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"This is an interactive book built with ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"pEoZNTTH51"},{"type":"link","url":"https://jupyterbook.org/en/stable/intro.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Jupyter Book","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"tul3RhP9SJ"}],"urlSource":"https://jupyterbook.org/en/stable/intro.html","key":"NYfae6xRUi"},{"type":"text","value":".\nIt uses ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"TfXJ0SLDDb"},{"type":"link","url":"https://docs.python.org/3.11/contents.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Python 3.11","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"hOk5PyFJyM"}],"urlSource":"https://docs.python.org/3.11/contents.html","key":"mQsvxMfH6c"},{"type":"text","value":".\nIt uses the ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"QTLDA2Yind"},{"type":"link","url":"https://jax.readthedocs.io/en/latest/index.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"JAX","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"eyH0NgddU2"}],"urlSource":"https://jax.readthedocs.io/en/latest/index.html","key":"jvwYMclqR5"},{"type":"text","value":" library for numerical computing.\nJAX was chosen for the clarity of its functional style and due to its mature RL ecosystem,\nsustained in large part by the Google DeepMind research group and a large body of open-source contributors.\nWe use the standard ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"tOnlXbqump"},{"type":"link","url":"https://gymnasium.farama.org/","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Gymnasium","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"kigOOppYWT"}],"urlSource":"https://gymnasium.farama.org/","key":"xzD4CzpxwD"},{"type":"text","value":" library for interfacing with RL environments.","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"WHyRITRcht"}],"key":"h3ex4XxV5n"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"The following names are exported from the ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"SlKnPDkpmZ"},{"type":"inlineCode","value":"utils","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"eFsosDWUev"},{"type":"text","value":" module:","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"Wr0PaiNoCG"}],"key":"Jq2scvz7cC"},{"type":"code","lang":"python","value":"import matplotlib.pyplot as plt\n\n# convenient class builder\nfrom typing import NamedTuple\n\n# function typings\nfrom collections.abc import Callable\n\n# array typings\nfrom jaxtyping import Float, Array\n\n# convenient function composition\nfrom functools import partial\n\n# numerical computing and 
linear algebra\nimport jax\nimport jax.numpy as jnp\n\n# print functions as latex\nimport latexify\n\nplt.style.use(\"fivethirtyeight\")","position":{"start":{"line":194,"column":1},"end":{"line":217,"column":1}},"key":"kjqBKIYtfL"}],"key":"TxNpnPxA1V"}],"key":"t6kBEsBLDo"},"references":{"cite":{"order":["sutton_reinforcement_2018","agarwal_reinforcement_2022","sussman_functional_2013"],"data":{"sutton_reinforcement_2018":{"label":"sutton_reinforcement_2018","enumerator":"1","html":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction (Second edition). The MIT Press."},"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"2","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."},"sussman_functional_2013":{"label":"sussman_functional_2013","enumerator":"3","html":"Sussman, G. J., Wisdom, J., & Farr, W. (2013). Functional Differential Geometry. The MIT Press."}}}},"footer":{"navigation":{"next":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/mdps.html b/mdps.html index f632430..6333bf5 100644 --- a/mdps.html +++ b/mdps.html @@ -14,10 +14,10 @@ ); root.querySelectorAll(".hide-mac").forEach(node => {node.classList.add(isMac ? "hidden" : "block")}); root.querySelectorAll(".show-mac").forEach(node => {node.classList.add(!isMac ? "hidden" : "block")}); -})()

1 Markov Decision Processes

1.1 Introduction

The field of RL studies how an agent can learn to make sequential decisions in an interactive environment. This is a very general problem! How can we formalize this task in a way that is both sufficiently general yet also tractable enough for fruitful analysis?

Let’s consider some examples of sequential decision problems to identify the key common properties we’d like to capture:

  • Board games and video games, where a player takes actions in a virtual environment.
  • Inventory management, where a company must efficiently move resources from producers to consumers.
  • Robotic control, where a robot can move and interact with the real world to complete some task.

In these environments and many others, the state transitions, the “rules” of the environment, only depend on the most recent state and action; the past history of moves doesn’t matter (generally speaking). This is called the Markov property.

Environments that satisfy the Markov property are called Markov decision processes (MDPs). This chapter will focus on introducing core vocabulary for MDPs that will be useful throughout the book.

class MDP(NamedTuple):
    """A description of a Markov decision process with finitely many states and actions."""
    S: int  # number of states
    A: int  # number of actions
    μ: Float[Array, " S"]  # initial state distribution
    P: Float[Array, "S A S"]  # "current" state, "current" action, "next" state
    r: Float[Array, "S A"]  # reward for each state-action pair
    H: int  # time horizon
    γ: float = 1.0  # discount factor (used later)

Consider a time horizon of $\hor = 7$ days (one interaction per day). Let $t = 0$ correspond to Monday and $t = 6$ correspond to Sunday.

tidy_mdp = MDP(
    S=2,  # 0 = orderly, 1 = messy
    A=2,  # 0 = ignore, 1 = tidy
    μ=jnp.array([1.0, 0.0]),  # start in orderly state
    # ... (the entries of P and r are elided in this hunk)
    H=7,
)
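
Since each row of the transition matrix must encode a probability distribution over next states, it is worth verifying the specification. Here is a small sanity check of our own (the helper name check_mdp is not from the source):

def check_mdp(mdp: MDP) -> None:
    """Verify the shapes and stochasticity of an MDP specification."""
    assert mdp.P.shape == (mdp.S, mdp.A, mdp.S)
    assert mdp.r.shape == (mdp.S, mdp.A)
    # each (state, action) pair must define a distribution over next states
    assert jnp.allclose(mdp.P.sum(axis=-1), 1.0)
    # the initial state distribution must also sum to one
    assert jnp.allclose(mdp.μ.sum(), 1.0)

check_mdp(tidy_mdp)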

1.2.2 Policies

A policy describes the agent’s strategy: which action to take in each state. In the finite-horizon setting, a policy may depend on the timestep, i.e. $\pi = \{ \pi_0, \dots, \pi_{\hor-1} \}$.

Note that for finite state and action spaces, we can represent a randomized mapping $\mathcal{S} \to \Delta(\mathcal{A})$ as a matrix $\pi \in [0, 1]^{\mathcal{S} \times \mathcal{A}}$ where each row describes the policy’s distribution over actions for the corresponding state.
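
For instance (a hypothetical snippet of our own, not from the source), the uniformly random policy for the tidying MDP is a 2 × 2 matrix whose rows each sum to one:

# each row is a distribution over the two actions (ignore, tidy)
uniform_policy = jnp.ones((2, 2)) / 2
assert jnp.allclose(uniform_policy.sum(axis=1), 1.0)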

A fascinating result is that every finite-horizon MDP has an optimal deterministic time-dependent policy! Intuitively, the Markov property implies that the current state contains all the information we need to make the optimal decision. We’ll prove this result constructively later in the chapter.

For example, the “tidy when messy” policy sets $\pi_\hi(\text{messy}) = \text{tidy}$ and $\pi_\hi(\text{orderly}) = \text{ignore}$ for all $\hi$.

# arrays of shape (H, S, A) represent time-dependent policies
tidy_policy_always_tidy = (
    jnp.zeros((7, 2, 2))
    .at[:, :, 1].set(1.0)
)
# ... (elided)
tidy_policy_messy_only = (
    jnp.zeros((7, 2, 2))
    .at[:, 1, 1].set(1.0)
    .at[:, 0, 0].set(1.0)
)

1.2.3 Trajectories

We represent each interaction as an immutable NamedTuple; immutability makes code much easier to reason about.

class Transition(NamedTuple):
    """A single state-action-reward interaction with the environment.

    A trajectory comprises a sequence of transitions.
    """
    s: int
    a: int
    r: float

Once we’ve chosen a policy, we can sample trajectories by repeatedly choosing actions according to the policy, transitioning according to the state transitions, and observing the rewards.

That is, a policy induces a distribution $\rho^\pi$ over trajectories. (We assume that $\mu$ and $P$ are clear from context.)

Note that for a state-dependent policy, using the Markov property (Definition 1.1), we can write down the likelihood function of this probability distribution in an autoregressive way (i.e. one timestep at a time):

def trajectory_log_likelihood(
    mdp: MDP,
    τ: list[Transition],
    π: Float[Array, "S A"],
) -> float:
    """Compute the log-likelihood of a trajectory under a given finite MDP and state-dependent policy."""
    # initial state and first action
    total = jnp.log(mdp.μ[τ[0].s])
    total += jnp.log(π[τ[0].s, τ[0].a])

    # remaining state transitions and action choices
    for i in range(1, mdp.H):
        total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s])
        total += jnp.log(π[τ[i].s, τ[i].a])

    return total
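
As a quick usage sketch of our own (not from the source), we can score a short hand-written trajectory under the uniformly random policy from above; note that the reward entries of each Transition are placeholders, since the likelihood does not use them:

# a hypothetical two-step trajectory: start orderly, ignore, then tidy
τ_example = [Transition(s=0, a=0, r=0.0), Transition(s=1, a=1, r=0.0)]
uniform_policy = jnp.ones((2, 2)) / 2
log_likelihood = trajectory_log_likelihood(
    tidy_mdp._replace(H=2), τ_example, uniform_policy
)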

For a deterministic policy π, we have that $\pi_\hi(a \mid s) = \mathbb{I}[a = \pi_\hi(s)]$; that is, the probability of taking an action is 1 if it’s the unique action prescribed by the policy for that state and 0 otherwise. In this case, the only randomness in sampling trajectories comes from the initial state distribution $\mu$ and the state transitions $P$.
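
To make the sampling process concrete, here is a minimal rollout sketch of our own (not from the source); the function name and signature are our own, and jax.random is assumed to be available via the jax import from the utils module:

def sample_trajectory(
    key: jax.Array,
    mdp: MDP,
    π: Float[Array, "H S A"],
) -> list[Transition]:
    """Roll out a (possibly time-dependent) policy in a known finite MDP."""
    τ = []
    key, subkey = jax.random.split(key)
    s = int(jax.random.choice(subkey, mdp.S, p=mdp.μ))  # initial state
    for h in range(mdp.H):
        key, action_key, state_key = jax.random.split(key, 3)
        a = int(jax.random.choice(action_key, mdp.A, p=π[h, s]))
        τ.append(Transition(s=s, a=a, r=float(mdp.r[s, a])))
        s = int(jax.random.choice(state_key, mdp.S, p=mdp.P[s, a]))
    return τ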

1.2.4 Value functions

The main goal of RL is to find a policy that maximizes the expected total reward $\E[r_0 + \cdots + r_{\hor-1}]$.

Let’s introduce some notation for analyzing this quantity.

A policy’s value function at time $\hi$ is its expected remaining reward from a given state (where the expectation is over trajectories sampled by running π):

$$V_\hi^\pi(s) = \E \left[ r_\hi + \cdots + r_{\hor-1} \mid s_\hi = s \right]$$

Similarly, we can define the action-value function (aka the Q-function) at time $\hi$ as the expected remaining reward from a given state and taking a given action:

$$Q_\hi^\pi(s, a) = \E \left[ r_\hi + \cdots + r_{\hor-1} \mid s_\hi = s, a_\hi = a \right]$$

1.2.4.1 Relating the value function and action-value function

Note that the value function is just the expected action-value over actions drawn from the policy:

$$V_\hi^\pi(s) = \E_{a \sim \pi_\hi(s)} [Q_\hi^\pi(s, a)]$$
def q_to_v(
    policy: Float[Array, "S A"],
    q: Float[Array, "S A"],
) -> Float[Array, " S"]:
    """
    Compute the value function for a given policy in a known finite MDP
    at a single timestep from its action-value function.
    """
    return jnp.average(q, weights=policy, axis=1)

and the action-value is the sum of the immediate reward and the expected value of the following state:

$$Q_\hi^\pi(s, a) = r(s, a) + \E_{s' \sim P(s, a)} [V_{\hi+1}^\pi(s')]$$
def v_to_q(
    mdp: MDP,
    v_next: Float[Array, " S"],
) -> Float[Array, "S A"]:
    """
    Compute the action-value function in a known finite MDP
    at a single timestep from the corresponding value function.
    """
    # the discount factor mdp.γ defaults to 1 in the finite-horizon case
    return mdp.r + mdp.γ * mdp.P @ v_next


# convert a list of v functions to a list of q functions
v_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0))

1.2.4.2 Greedy policies

For any given $Q \in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}$, we can define the greedy policy $\hat \pi_Q$ as the deterministic policy that selects the action with the highest $Q$-value at each state:

$$\hat \pi_Q(s) = \arg\max_{a} Q_{sa}$$
def q_to_greedy(q: Float[Array, "S A"]) -> Float[Array, "S A"]:
    """
    Get the (deterministic) greedy policy with respect to an action-value function.
    Return the policy as a matrix of shape (S, A) where each row is a one-hot vector.
    """
    # one-hot encode the argmax action for each state
    return jnp.eye(q.shape[1])[jnp.argmax(q, axis=1)]


def v_to_greedy(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]:
    """Get the (deterministic) greedy policy with respect to a value function."""
    return q_to_greedy(v_to_q(mdp, v))

1.2.5 The one-step (Bellman) consistency equation

Note that by simply considering the cumulative reward as the sum of the current reward and the future cumulative reward, we can describe the value function recursively (in terms of itself). This is named the Bellman consistency equation after Richard Bellman (1920--1984), who is credited with introducing dynamic programming in 1953.

def check_bellman_consistency_v(
    mdp: MDP,
    policy: Float[Array, "H S A"],
    v_ary: Float[Array, "H S"],
) -> bool:
    """
    Check that the given value function arrays satisfy the
    Bellman consistency equation at every timestep.
    """
    return all(
        jnp.allclose(
            # lhs: the value function at timestep h
            v_ary[h],
            # rhs: one-step lookahead using the value function at timestep h + 1
            jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1),
        )
        for h in range(mdp.H - 1)
    )

One can analogously derive the Bellman consistency equation for the action-value function:

$$Q_\hi^\pi(s, a) = r(s, a) + \E_{s' \sim P(s, a)} \E_{a' \sim \pi_{\hi+1}(s')} [Q_{\hi+1}^\pi(s', a')]$$
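
This can be checked the same way as for the value function; the following sketch is ours (not from the source) and mirrors check_bellman_consistency_v:

def check_bellman_consistency_q(
    mdp: MDP,
    policy: Float[Array, "H S A"],
    q_ary: Float[Array, "H S A"],
) -> bool:
    """Check that the given Q-function arrays satisfy Bellman consistency at every timestep."""
    return all(
        jnp.allclose(
            q_ary[h],
            # the expected Q-value at the next timestep plays the role of V
            mdp.r + mdp.γ * mdp.P @ jnp.sum(policy[h + 1] * q_ary[h + 1], axis=1),
        )
        for h in range(mdp.H - 1)
    )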

1.2.6 The one-step Bellman operator

Fix a policy π. Consider the higher-order operator that takes in a “value function” $v : \mathcal{S} \to \mathbb{R}$ and returns the r.h.s. of the Bellman equation for that “value function”:

def bellman_operator_looping(
    mdp: MDP,
    policy: Float[Array, "S A"],
    v: Float[Array, " S"],
) -> Float[Array, " S"]:
    """Apply the Bellman operator to a "value function" using explicit loops."""
    v_new = jnp.zeros(mdp.S)
    for s in range(mdp.S):
        for a in range(mdp.A):
            for s_next in range(mdp.S):
                v_new = v_new.at[s].add(
                    policy[s, a]
                    * mdp.P[s, a, s_next]
                    * (mdp.r[s, a] + mdp.γ * v[s_next])
                )
    return v_new

Note that we can concisely implement this using the q_to_v and v_to_q utilities from above:

def bellman_operator(
    mdp: MDP,
    policy: Float[Array, "S A"],
    v: Float[Array, " S"],
) -> Float[Array, " S"]:
    """For a known finite MDP, the Bellman operator can be exactly evaluated."""
    # return q_to_v(policy, v_to_q(mdp, v))  # equivalent
    return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1)
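
As a sanity check of our own (not in the source), the looping and vectorized implementations should agree on arbitrary inputs:

v_test = jnp.array([1.0, -1.0])  # an arbitrary "value function"
assert jnp.allclose(
    bellman_operator_looping(tidy_mdp, tidy_policy_messy_only[0], v_test),
    bellman_operator(tidy_mdp, tidy_policy_messy_only[0], v_test),
)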

We’ll call $\mathcal{J}^\pi : \mathbb{R}^\mathcal{S} \to \mathbb{R}^\mathcal{S}$ the Bellman operator of π. Note that it’s defined on any “value function” mapping states to real numbers; $v$ doesn’t have to be a well-defined value function for some policy (hence the lowercase notation). The Bellman operator also gives us a concise way to express Theorem 1.1 for the value function:

$$V_\hi^\pi = \mathcal{J}^{\pi}(V_{\hi+1}^\pi)$$

Intuitively, the output of the Bellman operator, a new “value function”, evaluates states as follows: from a given state, take one action according to π, observe the reward, and then evaluate the next state using the input “value function”.
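
For example (our own illustration, not from the source), applying the Bellman operator to the all-zeros “value function” returns each state’s expected immediate reward under the policy:

# with v ≡ 0, the one-step lookahead reduces to the expected immediate reward
v_zero = jnp.zeros(tidy_mdp.S)
expected_reward = bellman_operator(tidy_mdp, tidy_policy_messy_only[0], v_zero)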

When we discuss infinite-horizon MDPs, the Bellman operator will turn out to be a contraction mapping whose fixed point is the policy’s value function.

1.3.1 Policy evaluation in finite-horizon MDPs

The Bellman consistency equation gives the value function at timestep $\hi$ as a function of the value function at timestep $\hi+1$. This means we can start at the end of the time horizon, where the value is known, and work backwards in time, using the Bellman consistency equation to compute the value function at each time step.


def dp_eval_finite(mdp: MDP, policy: Float[Array, "H S A"]) -> Float[Array, "H S"]:
    """Evaluate a policy using dynamic programming."""
    V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)]  # initialize to 0 at end of time horizon
    for h in range(mdp.H - 1, -1, -1):
        V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1])
    return jnp.stack(V_ary[:-1])

This runs in time $O(H \cdot |\mathcal{S}|^2 \cdot |\mathcal{A}|)$ by counting the loops.

V_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only)
V_messy

Array([[5.5621696, 4.7927704],
       [4.7927704, 4.0241003],
       [4.0241003, 3.253    ],
       [3.253    , 2.49     ],
       [2.49     , 1.7      ],
       [1.7      , 1.       ],
       [1.       , 0.       ]], dtype=float32)

You may wish to repeat this computation for the other policies to get a better sense of this algorithm.

1.3.2 Optimal policies in finite-horizon MDPs

We’ve just seen how to evaluate a given policy. But how can we find the optimal policy for a given environment?

Convince yourself that all optimal policies must have the same value function. We call this the optimal value function and denote it by $V_\hi^\star(s)$. Similarly, we denote the optimal action-value function by $Q_\hi^\star(s, a)$.

It is a stunning fact that every finite-horizon MDP has an optimal policy that is time-dependent and deterministic. In particular, we can construct such a policy by acting greedily with respect to the optimal action-value function:

$$\pi_\hi^\star(s) = \arg\max_a Q_\hi^\star(s, a)$$

Note that this also gives simplified forms of the Bellman consistency equations for the optimal policy:

$$\begin{aligned}
V_\hi^\star(s) &= \max_a Q_\hi^\star(s, a) \\
Q_\hi^\star(s, a) &= r(s, a) + \E_{s' \sim P(s, a)} [V_{\hi+1}^\star(s')]
\end{aligned}$$

Now that we’ve shown this particular greedy policy is optimal, all we need to do is compute the optimal value function and optimal policy. We can do this by working backwards in time using dynamic programming (DP).

def find_optimal_policy(mdp: MDP):
    Q = [None] * mdp.H
    pi = [None] * mdp.H
    V = [None] * mdp.H + [jnp.zeros(mdp.S)]  # initialize to 0 at end of time horizon

    # work backwards in time, acting greedily w.r.t. the optimal action-value function
    for h in range(mdp.H - 1, -1, -1):
        Q[h] = v_to_q(mdp, V[h + 1])
        pi[h] = q_to_greedy(Q[h])
        V[h] = q_to_v(pi[h], Q[h])

    Q = jnp.stack(Q)
    pi = jnp.stack(pi)
    V = jnp.stack(V[:-1])

    return pi, V, Q

At each of the $H$ timesteps, we must compute $Q^{\star}$ for each of the $|\mathcal{S}| |\mathcal{A}|$ state-action pairs. Each computation takes $|\mathcal{S}|$ operations to evaluate the average value over $s'$. This gives a total computation time of $O(H \cdot |\mathcal{S}|^2 \cdot |\mathcal{A}|)$.

Note that this algorithm is identical to the policy evaluation algorithm dp_eval_finite, but instead of averaging over the actions chosen by a policy, we instead simply take a maximum over the action-values. We’ll see this relationship between policy evaluation and optimal policy computation show up again in the infinite-horizon setting.

π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp)
assert jnp.allclose(π_opt, tidy_policy_messy_only)
assert jnp.allclose(V_opt, V_messy)
assert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:])
"Assertions passed (the 'tidy when messy' policy is optimal)"

"Assertions passed (the 'tidy when messy' policy is optimal)"

1.4 Infinite-horizon MDPs

What happens if a trajectory is allowed to continue forever (i.e. $H = \infty$)? This is the setting of infinite-horizon MDPs.

In this chapter, we’ll describe the necessary adjustments from the finite-horizon case to make the problem tractable. We’ll show that the Bellman operator in the discounted reward setting is a contraction mapping for any policy.

1.4.1 Discounted rewards

First of all, note that maximizing the cumulative reward $r_\hi + r_{\hi+1} + r_{\hi+2} + \cdots$ is no longer a good idea since it might blow up to infinity. Instead of a time horizon $H$, we now need a discount factor $\gamma \in [0, 1)$ such that rewards become less valuable the further into the future they are:

$$r_\hi + \gamma r_{\hi+1} + \gamma^2 r_{\hi+2} + \cdots = \sum_{k=0}^\infty \gamma^k r_{\hi+k}.$$

We can think of γ as measuring how much we care about the future: if it’s close to 0, we only care about the near-term rewards; if it’s close to 1, we put more weight into future rewards.

You can also analyze γ as the probability of continuing the trajectory at each time step. (This is equivalent to $H$ being drawn from a geometric distribution with parameter $1 - \gamma$.) This accords with the above interpretation: if γ is close to 1, the trajectory will likely continue for a long time. When working with discounted sums like this, it may be useful to review geometric series.
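Concretely, for a constant reward of 1 per step the discounted sum is a geometric series summing to $1/(1-\gamma)$. A quick numerical sanity check (the truncation at 1000 terms is arbitrary):

k = jnp.arange(1000)
assert jnp.isclose(jnp.sum(0.95**k), 1 / (1 - 0.95))  # both ≈ 20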

The other components of the MDP remain the same:

$$M = (\mathcal{S}, \mathcal{A}, \mu, P, r, \gamma).$$

Code-wise, we can reuse the MDP class from Definition 1.2 and set mdp.H = float('inf').

tidy_mdp_inf = tidy_mdp._replace(H=float("inf"), γ=0.95)

1.4.2 Stationary policies

The time-dependent policies from the finite-horizon case become difficult to handle in the infinite-horizon case. In particular, many of the DP approaches we saw required us to start at the end of the trajectory, which is no longer possible. We’ll shift to stationary policies $\pi : \mathcal{S} \to \triangle(\mathcal{A})$, which act the same way at every timestep. Since the policy, the transitions, and the discounted objective all look the same from any timestep onwards, the value function can also be defined without a timestep subscript: does it matter which time step we condition on when defining the value function?

1.5 Solving infinite-horizon MDPs

1.5.1 The Bellman operator is a contraction mapping

Recall from Definition 1.8 that the Bellman operator $\mathcal{J}^{\pi}$ for a policy π takes in a “value function” $v : \mathcal{S} \to \mathbb{R}$ and returns the r.h.s. of the Bellman equation for that “value function”. In the infinite-horizon setting, this is

$$[\mathcal{J}^{\pi}(v)](s) := \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} [r(s, a) + \gamma v(s')].$$

The crucial property of the Bellman operator is that it is a contraction mapping for any policy. Intuitively, if we start with two “value functions” $v, u : \mathcal{S} \to \mathbb{R}$ and repeatedly apply the Bellman operator to each of them, they will get closer and closer together at an exponential rate.
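We can observe this numerically. The following sketch reuses bellman_operator, tidy_mdp_inf, and tidy_policy_messy_only from this chapter, starting from two arbitrary “value functions”; each application of the operator should shrink their supremum-norm distance by a factor of at most γ = 0.95:

u = jnp.array([0.0, 10.0])  # two arbitrary starting "value functions"
v = jnp.array([5.0, -5.0])
pi = tidy_policy_messy_only[0]  # a stationary policy
for _ in range(5):
    u = bellman_operator(tidy_mdp_inf, pi, u)
    v = bellman_operator(tidy_mdp_inf, pi, v)
    print(jnp.max(jnp.abs(u - v)))  # the gap shrinks geometrically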

It is a powerful fact (known as the Banach fixed-point theorem) that every contraction mapping has a unique fixed point $x^\star$ such that $f(x^\star) = x^\star$. If we repeatedly apply $f$ to any starting point, we will eventually converge to $x^\star$:

$$\|f^{(t)}(x) - x^\star\| \le \gamma^t \|x - x^\star\|.$$

Let’s return to the RL setting and apply this result to the Bellman operator. How can we measure the distance between two “value functions” $v, u : \mathcal{S} \to \mathbb{R}$? We’ll take the supremum norm as our distance metric:

$$\| v - u \|_{\infty} := \sup_{s \in \mathcal{S}} |v(s) - u(s)|,$$

i.e. we compare the “value functions” on the state that causes the biggest gap between them. Then (1.36) implies that if we repeatedly apply $\mathcal{J}^\pi$ to any starting “value function”, we will eventually converge to $V^\pi$:

$$\|(\mathcal{J}^\pi)^{(t)}(v) - V^\pi \|_{\infty} \le \gamma^{t} \| v - V^\pi\|_{\infty}.$$

We’ll use this useful fact to prove the convergence of several algorithms later on.

1.5.2 Policy evaluation in infinite-horizon MDPs

The backwards DP technique we used in the finite-horizon case no longer works since there is no “final timestep” to start from. We’ll need another approach to policy evaluation.

The Bellman consistency conditions yield a system of equations we can solve to evaluate a deterministic policy exactly. For a faster approximate solution, we can iterate the policy’s Bellman operator, since we know that it has a unique fixed point at the true value function.

1.5.2.1 Matrix inversion for deterministic policies

Note that when the policy π is deterministic, the actions can be determined from the states, and so we can chop off the action dimension for the rewards and state transitions:

$$\begin{aligned}
r^{\pi} &\in \mathbb{R}^{|\mathcal{S}|} &
P^{\pi} &\in [0, 1]^{|\mathcal{S}| \times |\mathcal{S}|} &
\mu &\in [0, 1]^{|\mathcal{S}|} \\
\pi &\in \mathcal{A}^{|\mathcal{S}|} &
V^\pi &\in \mathbb{R}^{|\mathcal{S}|} &
Q^\pi &\in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}.
\end{aligned}$$

For $P^\pi$, we’ll treat the rows as the states and the columns as the next states. Then $P^\pi_{s, s'}$ is the probability of transitioning from state $s$ to state $s'$ under policy π.

The Bellman consistency equation for a deterministic policy can be written in tabular notation as

$$V^\pi = r^\pi + \gamma P^\pi V^\pi.$$

(Unfortunately, this notation doesn’t simplify the expression for $Q^\pi$.) This system of equations can be solved with a matrix inversion:

$$V^\pi = (I - \gamma P^\pi)^{-1} r^\pi.$$

(Note that $I - \gamma P^\pi$ is indeed invertible: its null space is trivial, since any nonzero vector $v$ has at least one nonzero element, and at a state $s$ where $|v(s)| = \|v\|_\infty$ we have $|v(s) - \gamma (P^\pi v)(s)| \ge (1 - \gamma) \|v\|_\infty > 0$.)

def eval_deterministic_infinite(
    mdp: MDP, policy: Float[Array, "S A"]
) -> Float[Array, " S"]:
    pi = jnp.argmax(policy, axis=1)  # un-one-hot
    P_π = mdp.P[jnp.arange(mdp.S), pi]
    r_π = mdp.r[jnp.arange(mdp.S), pi]
    return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π)

eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])

Array([15.56419, 14.78598], dtype=float32)

1.5.2.2 Iterative policy evaluation

The matrix inversion above takes roughly $O(|\mathcal{S}|^3)$ time (in practice, the exponent achievable by fast matrix-multiplication algorithms is somewhat lower than this). It also only works for deterministic policies. Can we trade off the requirement of finding the exact value function for a faster approximate algorithm that will also extend to stochastic policies?

Let’s use the Bellman operator to define an iterative algorithm for computing the value function. We’ll start with an initial guess $v^{(0)}$ with elements in $[0, 1/(1-\gamma)]$ and then iterate the Bellman operator:

$$v^{(t+1)} = \mathcal{J}^{\pi}(v^{(t)}),$$

i.e. $v^{(t)} = (\mathcal{J}^{\pi})^{(t)} (v^{(0)})$. Note that each iteration takes $O(|\mathcal{S}|^2)$ time for the matrix-vector multiplication.

def supremum_norm(v):
    return jnp.max(jnp.abs(v))  # same as jnp.linalg.norm(v, jnp.inf)


def loop_until_convergence(op, v, ε=1e-6):
    """Repeatedly apply op to v until the sup-norm change is below ε."""
    while True:
        v_new = op(v)
        if supremum_norm(v_new - v) < ε:
            return v_new
        v = v_new


def iterative_evaluation(mdp: MDP, pi: Float[Array, "S A"], ε=1e-6) -> Float[Array, " S"]:
    op = partial(bellman_operator, mdp, pi)
    return loop_until_convergence(op, jnp.zeros(mdp.S), ε)

Then, as we showed in (1.38), by the Banach fixed-point theorem:

$$\|v^{(t)} - V^\pi \|_{\infty} \le \gamma^{t} \| v^{(0)} - V^\pi\|_{\infty}.$$

iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])
Array([15.564166, 14.785956], dtype=float32)
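We can also check the convergence rate (1.38) empirically, reusing the helpers above: the sup-norm error of the iterates should shrink by a factor of at most γ = 0.95 per step. A sketch:

V_true = eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])
v = jnp.zeros(tidy_mdp_inf.S)
for t in range(5):
    print(t, supremum_norm(v - V_true))  # error at iteration t
    v = bellman_operator(tidy_mdp_inf, tidy_policy_messy_only[0], v)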

1.5.3 Optimal policies in infinite-horizon MDPs

Now let’s move on to solving for an optimal policy in the infinite-horizon case. As in the finite-horizon case, an optimal policy $\pi^\star$ is one that does at least as well as any other policy in all situations. Note that the Bellman consistency equation (1.32) for the optimal value function doesn’t depend on any policy:

$$V^\star(s) = \max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} V^\star(s') \right].$$

As before, thinking of the r.h.s. of (1.53) as an operator on value functions gives the Bellman optimality operator

$$[\mathcal{J}^{\star}(v)](s) = \max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} v(s') \right]$$

def bellman_optimality_operator(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, " S"]:
    return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)


def check_optimal(v: Float[Array, " S"], mdp: MDP):
    return jnp.allclose(v, bellman_optimality_operator(mdp, v))

1.5.3.1 Value iteration

Since the optimal policy is still a policy, our result that the Bellman operator is a contraction mapping still holds, and so we can repeatedly apply this operator to converge to the optimal value function! This algorithm is known as value iteration.

def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, " S"]:
    """Iterate the Bellman optimality operator until convergence."""
    op = partial(bellman_optimality_operator, mdp)
    return loop_until_convergence(op, jnp.zeros(mdp.S), ε)

value_iteration(tidy_mdp_inf)

Array([15.564166, 14.785956], dtype=float32)

Note that the runtime analysis for an ε-optimal value function is exactly the same as iterative policy evaluation! This is because value iteration is simply the special case of applying iterative policy evaluation to the optimal value function.

As the final step of the algorithm, to return an actual policy $\hat \pi$, we can simply act greedily with respect to the final iteration $v^{(T)}$ of our above algorithm:

$$\hat \pi(s) = \arg\max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} v^{(T)}(s') \right].$$
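In code, this greedy extraction might look like the following sketch, consistent with how the helper v_to_greedy is used in policy_iteration below (the chapter defines it elsewhere; this version is only for reference):

def v_to_greedy(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]:
    """Return the deterministic (one-hot) policy that acts greedily w.r.t. v."""
    Q = mdp.r + mdp.γ * mdp.P @ v  # one-step lookahead action-values
    return jnp.eye(mdp.A)[jnp.argmax(Q, axis=1)]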

We must be careful, though: the value function of this greedy policy, $V^{\hat \pi}$, is not the same as $v^{(T)}$, which need not even be a well-defined value function for some policy!

The bound on the policy’s quality is actually quite loose: if $\|v^{(T)} - V^\star\|_{\infty} \le \epsilon$, then the greedy policy $\hat \pi$ satisfies $\|V^{\hat \pi} - V^\star\|_{\infty} \le \frac{2\gamma}{1-\gamma} \epsilon$, which might potentially be very large.


So in order to compensate and achieve $\|V^{\hat \pi} - V^{\star}\| \le \epsilon$, we must have

$$\|v^{(T)} - V^\star\|_{\infty} \le \frac{1-\gamma}{2 \gamma} \epsilon.$$

This means, using Remark 1.2, we need to run value iteration for

$$T = O\left( \frac{1}{1-\gamma} \log\left(\frac{\gamma}{\epsilon (1-\gamma)^2}\right) \right)$$

iterations to achieve an ε-accurate estimate of the optimal value function.
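As a rough sense of scale, suppose (hypothetically) γ = 0.95 and ε = 0.01. Ignoring the constant hidden in the $O$ notation,

$$T \approx \frac{1}{1-0.95} \log\left(\frac{0.95}{0.01 \cdot (1-0.95)^2}\right) = 20 \log(38{,}000) \approx 211,$$

so a couple hundred iterations suffice at this accuracy. Note how the $\frac{1}{1-\gamma}$ factor blows up as γ approaches 1.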

1.5.3.2 Policy iteration

Can we mitigate this “greedy worsening”? What if instead of approximating the optimal value function and then acting greedily with respect to it at the very end, we iteratively improve the policy and value function together? This is the idea behind policy iteration. In each step, we simply set the policy to act greedily with respect to its own value function.

def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, "S A"]:
    """Iteratively improve the policy and value function."""
    def op(pi):
        return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi))
    π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A  # uniform random policy
    return loop_until_convergence(op, π_init, ε)

policy_iteration(tidy_mdp_inf)

Array([[1., 0.],
       [0., 1.]], dtype=float32)

Although PI appears more complex than VI, we’ll use the same contraction property (Theorem 1.4) to show convergence. This will give us the same runtime bound as value iteration and iterative policy evaluation for an ε-optimal value function (Remark 1.2), although in practice, PI often converges much faster.

The key step is that since $\pi^{t+1}$ acts greedily with respect to $V^{\pi^t}$, applying its Bellman operator to $V^{\pi^t}$ matches the Bellman optimality operator there, and one can show (elementwise) that $V^{\pi^{t+1}} \ge \mathcal{J}^{\star}(V^{\pi^{t}}) \ge V^{\pi^t}$.

This means we can now apply the Bellman convergence result (1.38) to get

$$\|V^{\pi^{t+1}} - V^\star \|_{\infty} \le \|\mathcal{J}^{\star} (V^{\pi^{t}}) - V^{\star}\|_{\infty} \le \gamma \|V^{\pi^{t}} - V^\star \|_{\infty}.$$
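To get a feel for the speed difference between PI and VI in practice, one could count operator applications directly. The sketch below reuses the helpers defined in this chapter (it re-implements the convergence loop so that it can return a count; the printed numbers are illustrative only):

def count_iterations(op, x, ε=1e-6):
    """Apply op until convergence, returning the number of applications."""
    t = 0
    while True:
        x_new = op(x)
        if jnp.max(jnp.abs(x_new - x)) < ε:
            return t + 1
        x, t = x_new, t + 1

vi_iters = count_iterations(partial(bellman_optimality_operator, tidy_mdp_inf), jnp.zeros(tidy_mdp_inf.S))
pi_iters = count_iterations(
    lambda pi: v_to_greedy(tidy_mdp_inf, eval_deterministic_infinite(tidy_mdp_inf, pi)),
    jnp.ones((tidy_mdp_inf.S, tidy_mdp_inf.A)) / tidy_mdp_inf.A,
)
print(vi_iters, pi_iters)  # PI typically needs far fewer (but more expensive) iterations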

1.6 Summary

  • Markov decision processes (MDPs) are a framework for sequential decision making under uncertainty. They consist of a state space $\mathcal{S}$, an action space $\mathcal{A}$, an initial state distribution $\mu \in \Delta(\mathcal{S})$, a transition function $P(s' \mid s, a)$, and a reward function $r(s, a)$. They can be finite-horizon (ending after $H$ timesteps) or infinite-horizon (where future rewards are discounted by $\gamma \in [0, 1)$).

  • A key goal is to compute (or approximate) the optimal policy, which maximizes the total reward on average.

  • The value function of a policy satisfies the Bellman consistency equation, which we can use to compute value functions exactly. Thinking of the r.h.s. of this equation as an operator on value functions gives the Bellman operator.

  • In the finite-horizon setting, we can compute the optimal policy using dynamic programming.

  • In the infinite-horizon setting, we can compute the optimal policy using value iteration or policy iteration.

%s","enumerator":"1.1","resolved":true,"html_id":"tidy-mdp","key":"IkFJ4jOsJK"},{"type":"text","value":":","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"v7jRW1NVFu"}],"key":"i8LSA85tBP"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":255,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":255,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"children":[{"type":"text","value":"Always tidy: ","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"EnbipLbQZP"},{"type":"inlineMath","value":"\\pi(s) = \\text{tidy}","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"html":"π(s)=tidy\\pi(s) = \\text{tidy}π(s)=tidy","key":"OmYj4CZdEJ"},{"type":"text","value":".","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"zIpcHy80xw"}],"key":"LLA7UoUqwQ"}],"key":"r6NaWNMm3y"},{"type":"listItem","spread":true,"position":{"start":{"line":257,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":257,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"Only tidy on weekends: ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"e7z6q35PeE"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{tidy}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=tidy\\pi_\\hi(s) = \\text{tidy}πh(s)=tidy","key":"E7NuxKkcwB"},{"type":"text","value":" if\n","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"O0xMdIwFpw"},{"type":"inlineMath","value":"\\hi \\in \\{ 5, 6 \\}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"h{5,6}\\hi \\in \\{ 5, 6 \\}h{5,6}","key":"KRqnhtdsJ0"},{"type":"text","value":" and ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"hHt2H8Jqmb"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{ignore}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=ignore\\pi_\\hi(s) = \\text{ignore}πh(s)=ignore","key":"ROMtZ3aJvi"},{"type":"text","value":" otherwise.","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"WFzW9MMX1n"}],"key":"fGDDcLz3pR"}],"key":"oR5eMf0A1k"},{"type":"listItem","spread":true,"position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"text","value":"Only tidy if the room is messy: ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"WcL25T0bVS"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{messy}) = \\text{tidy}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(messy)=tidy\\pi_\\hi(\\text{messy}) = \\text{tidy}πh(messy)=tidy","key":"ltty6yIdiR"},{"type":"text","value":"\nand ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"oL2nG9osNq"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{orderly}) = \\text{ignore}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(orderly)=ignore\\pi_\\hi(\\text{orderly}) = \\text{ignore}πh(orderly)=ignore","key":"wGKqoJV0sR"},{"type":"text","value":" for all 
","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"FqnItqJEXr"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"h\\hih","key":"TnbqMvavWL"},{"type":"text","value":".","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"ZUamQMSAvw"}],"key":"IFD7602ekf"}],"key":"QJqs6wax90"}],"key":"j6G4xxLwYu"}],"enumerator":"1.2","html_id":"tidy-policy","key":"UJHjSAJCkD"}],"key":"JG0DGjwzpe"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# arrays of shape (H, S, A) represent time-dependent policies\ntidy_policy_always_tidy = (\n jnp.zeros((7, 2, 2))\n .at[:, :, 1].set(1.0)\n)\ntidy_policy_weekends = (\n jnp.zeros((7, 2, 2))\n .at[5:7, :, 1].set(1.0)\n .at[0:5, :, 0].set(1.0)\n)\ntidy_policy_messy_only = (\n jnp.zeros((7, 2, 2))\n .at[:, 1, 1].set(1.0)\n .at[:, 0, 0].set(1.0)\n)","key":"ZPbqYroJQh"},{"type":"output","id":"P8_4pfkI-HviwI48MQCx_","data":[],"key":"BwaGGUZft4"}],"data":{},"key":"B8NUl7sfob"},{"type":"block","children":[{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"Q1jezJqC8H"}],"key":"ZxiDN56Jmn"},{"type":"paragraph","position":{"start":{"line":283,"column":1},"end":{"line":285,"column":1}},"children":[{"type":"text","value":"Array objects in Jax are ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"RlkLkCgh9c"},{"type":"strong","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"immutable,","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"yoCWai1Z2u"}],"key":"zcuOmZZq7U"},{"type":"text","value":" that is, they cannot be ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"zyHDrLsy3t"},{"type":"emphasis","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"changed.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"SbaSH9hWeY"}],"key":"IxnAxsW1Sv"},{"type":"text","value":"\nThis might seem inconvenient, but in larger projects,\nimmutability makes code much easier to reason about.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"dfDZ4V9Ldz"}],"key":"zaKPVZtpmm"}],"key":"ETMnbgdYwj"}],"key":"Es0rUGpAuw"},{"type":"block","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Trajectories","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"TU9PudwrCj"}],"label":"trajectories","identifier":"trajectories","html_id":"trajectories","enumerator":"1.2.3","key":"oBv74XCRzw"},{"type":"proof","kind":"definition","label":"trajectory","identifier":"trajectory","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"oml6zLjvT2"}],"key":"CSVNtb79QO"},{"type":"paragraph","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"A sequence of states, actions, and rewards is called a 
","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"BRZEURv3qv"},{"type":"strong","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"sswSezoRAi"}],"key":"vOVVmuYeDM"},{"type":"text","value":":","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"A8hKjYv1BT"}],"key":"hUsupSwQ44"},{"type":"math","value":"\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"τ=(s0,a0,r0,,sH1,aH1,rH1)\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})τ=(s0,a0,r0,,sH1,aH1,rH1)","enumerator":"1.5","key":"qAo81S0ZG8"},{"type":"paragraph","position":{"start":{"line":300,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"kEAvUzAaAS"},{"type":"inlineMath","value":"r_\\hi = r(s_\\hi, a_\\hi)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"rh=r(sh,ah)r_\\hi = r(s_\\hi, a_\\hi)rh=r(sh,ah)","key":"koVI2dgVs6"},{"type":"text","value":".\n(Note that some sources omit the reward at the final time step. This is a minor detail.)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"sMeHTwmTLi"}],"key":"GhV0wiaqLY"}],"enumerator":"1.4","html_id":"trajectory","key":"b2755S5tZH"}],"key":"VR7EBwLjl3"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Transition(NamedTuple):\n \"\"\"A single state-action-reward interaction with the environment.\n\n A trajectory comprises a sequence of transitions.\n \"\"\"\n s: int\n a: int\n r: float","key":"VjeSHHfSRf"},{"type":"output","id":"ID-7VHa7fBi5Zy6OBhkQI","data":[],"key":"foADudGMfb"}],"data":{},"key":"JPqV2eHejn"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":315,"column":1},"end":{"line":317,"column":1}},"children":[{"type":"text","value":"Once we’ve chosen a policy,\nwe can sample trajectories by repeatedly choosing actions according to the policy,\ntransitioning according to the state transitions, and observing the rewards.","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"Ie7SI9XyHX"}],"key":"WHepnoOsZ6"},{"type":"image","url":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.png","width":"240px","align":"center","key":"v0ueWWME8H","urlSource":"shared/trajectory.png","urlOptimized":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.webp"},{"type":"paragraph","position":{"start":{"line":324,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"text","value":"That is, a policy induces a distribution ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"k9OzkJm9Oa"},{"type":"inlineMath","value":"\\rho^{\\pi}","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"ρπ\\rho^{\\pi}ρπ","key":"v97Wbp2kH3"},{"type":"text","value":" over trajectories.\n(We assume that ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"Ds2JxJ4HBA"},{"type":"text","value":"μ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"Aj2RLUhhK7"},{"type":"text","value":" and 
","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"MLCVAfwebT"},{"type":"inlineMath","value":"P","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"PPP","key":"LHYE5UqLUy"},{"type":"text","value":" are clear from context.)","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"B4IT2XnBy9"}],"key":"XJP7hxK4nn"},{"type":"proof","kind":"example","label":"tidy_traj","identifier":"tidy_traj","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories in the tidying environment","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"cHyXVTvVWi"}],"key":"fv3GQvsiQs"},{"type":"paragraph","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"children":[{"type":"text","value":"Here is a possible trajectory for the tidying example:","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"key":"FRkWu6bB8Q"}],"key":"baFXJtF9Iy"},{"type":"container","kind":"table","children":[{"type":"table","position":{"start":{"line":333,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"h\\hih","key":"LBR8iKZNnj"}],"key":"RQDRh5X0lZ"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"axuX1taaJa"}],"key":"UI57Qm5xfZ"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"yTaGlmh65W"}],"key":"rEnaJq667I"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"2","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"MILSxWq910"}],"key":"pG10NiojfM"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"RyZBsK6kaf"}],"key":"VNLUaLCZuR"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"4","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"MSUYiQcVFJ"}],"key":"NhhVm1KI0f"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"5","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"PX0kQAzEcq"}],"key":"UsobVc7bE6"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"6","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"SCALg48D0k"}],"key":"xFbH
pGLvGk"}],"key":"R317KvtIwW"},{"type":"tableRow","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"sss","key":"K5fMZ52uPp"}],"key":"Q5yMMWyoo9"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"vQCrSKOY0t"}],"key":"ZuuNDO5CB1"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"xJjXaqm1F1"}],"key":"GoQXq6CGLJ"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"HFA4ygPSDm"}],"key":"kDMgR8vWTn"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"khxKhJXouK"}],"key":"njuxwFOte9"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"fAxHG8CJ1M"}],"key":"e6wA0p5N9g"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"bapqcbcbAn"}],"key":"DSmkkkJpx1"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"q1PMqdf03R"}],"key":"oe8X7LzJkT"}],"key":"fwwUJjy9OQ"},{"type":"tableRow","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"html":"aaa","key":"UMCa6gqr1G"}],"key":"bnejfGaUMJ"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"guYnecYXyQ"}],"key":"PqMtsa5GNI"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"rkzyk4gs3y"}],"key":"VpwxqRteFa"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"iB8HBjJAUq"}],"key":"VsG1MgURys"}
,{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"SmPWCRJOOo"}],"key":"OJICgdtz2A"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"LJs3SlX2rz"}],"key":"LQe34vfDwu"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"zDKzyRQLm2"}],"key":"xD62tMA21z"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"tDBHyPiF4X"}],"key":"B3MvysQS8h"}],"key":"ikjPCO8gtb"},{"type":"tableRow","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"html":"rrr","key":"CTqt7TUHy5"}],"key":"Usf6xGG7vT"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"MgjYb2rCNv"}],"key":"O1bR3Yo95O"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"gZo87xOqc6"}],"key":"C3qnFeXoJa"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"pz7Eief45R"}],"key":"JGLBq2AKin"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"CsxedymgPk"}],"key":"ko9gstkHon"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"GOVY6h22eE"}],"key":"smFkfEqH6E"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"zwPgMeYet1"}],"key":"lWmB5fs2nd"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"oVec7JD3iy"}],"key":"tRJvW9IHv3"}],"key":"d6PdMa2bxu"}],"key":"I1z9VtbRSS"}],"enumerator":"1.1","key":"uRIpY5EQU4"},{"type":"paragraph","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"childr
en":[{"type":"text","value":"Could any of the policies in ","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"sRDzoHTGIg"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"txNbU8fZh2"},{"type":"text","value":"1.2","key":"PON3K6OrwS"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"F7jRdaBAWR"},{"type":"text","value":" have generated this trajectory?","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"ccX3OvEOjW"}],"key":"gCGQ1XElch"}],"enumerator":"1.3","html_id":"tidy-traj","key":"eldLmCfJDG"},{"type":"paragraph","position":{"start":{"line":343,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"Note that for a state-dependent policy, using the Markov property ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"Ls03hFVovP"},{"type":"crossReference","kind":"proof:definition","identifier":"markov","label":"markov","children":[{"type":"text","value":"Definition ","key":"yFy7v430sH"},{"type":"text","value":"1.1","key":"xBPrRf4WXh"}],"template":"Definition %s","enumerator":"1.1","resolved":true,"html_id":"markov","key":"eHafE1iNUD"},{"type":"text","value":",\nwe can write down the likelihood function of this probability distribution in an ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"eRbKZf4zyN"},{"type":"strong","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"autoregressive","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"wI2cfLR9M0"}],"key":"tPpZVhF8x0"},{"type":"text","value":" way (i.e. 
one timestep at a time):","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"U7a2O8ScQV"}],"key":"a8KLOe9Aae"},{"type":"proof","kind":"definition","label":"autoregressive_trajectories","identifier":"autoregressive_trajectories","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Autoregressive trajectory distribution","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"Md4dfcbDMS"}],"key":"oEl7x2MtYV"},{"type":"math","value":"\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)","enumerator":"1.6","key":"Fza4ShZyY3"}],"enumerator":"1.5","html_id":"autoregressive-trajectories","key":"tNq86JzqUl"}],"key":"rlDVsRAEzI"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def trajectory_log_likelihood(\n mdp: MDP,\n τ: list[Transition],\n π: Float[Array, \"S A\"],\n) -> float:\n \"\"\"Compute the log-likelihood of a trajectory under a given MDP and policy.\"\"\"\n\n # initial distribution and action\n total = jnp.log(mdp.μ[τ[0].s])\n total += jnp.log(π[τ[0].s, τ[0].a])\n\n # remaining state transitions and actions\n for i in range(1, mdp.H):\n total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s])\n total += jnp.log(π[τ[i].s, τ[i].a])\n\n return total","key":"sOjjx1iST8"},{"type":"output","id":"nxjf8d5HG6zfj5xHXdT6a","data":[],"key":"x97p77VQkY"}],"data":{},"key":"L0J9CJ1sab"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"mpTyGdqmAc"}],"key":"PQkdzSmmcn"},{"type":"paragraph","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"text","value":"How would you modify this to include stochastic rewards?","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"key":"WpJm2MobTb"}],"key":"UY75U6yO2u"}],"key":"JYVDQMrSxA"},{"type":"paragraph","position":{"start":{"line":376,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"For a deterministic policy ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"E2E0Pu7trp"},{"type":"text","value":"π","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"OvASHSZkVF"},{"type":"text","value":", we have that ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"Ke9JF3B2a6"},{"type":"inlineMath","value":"\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"πh(as)=I[a=πh(s)]\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]πh(as)=I[a=πh(s)]","key":"K2TnT5XmpN"},{"type":"text","value":";\nthat is, the probability of taking an action is 
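As a usage sketch for `trajectory_log_likelihood` (our own illustration; `τ_example` and `uniform_policy` are hypothetical names), we can encode the trajectory from Example 1.3 and score it under a uniformly random stationary policy. Note that each of the three deterministic policies in Example 1.2 assigns probability zero to at least one action in this trajectory, so their log-likelihoods would be `-inf`; also, the function as written takes a single `(S, A)` policy, so a time-dependent policy would need a small modification.

```python
# the trajectory from the table above, using 0 = orderly/ignore, 1 = messy/tidy
τ_example = [
    Transition(s=0, a=1, r=-1.0),  # Monday: orderly, tidy
    Transition(s=0, a=0, r=1.0),
    Transition(s=0, a=0, r=1.0),
    Transition(s=1, a=0, r=-1.0),
    Transition(s=1, a=1, r=0.0),
    Transition(s=0, a=0, r=1.0),
    Transition(s=0, a=0, r=1.0),  # Sunday: orderly, ignore
]
uniform_policy = jnp.ones((2, 2)) / 2  # hypothetical stationary policy
log_lik = trajectory_log_likelihood(tidy_mdp, τ_example, uniform_policy)
```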
","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"Lwjkcz5oaK"},{"type":"text","value":"1","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"EV5uJM4f9U"},{"type":"text","value":" if it’s the unique action prescribed by the policy for that state and ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"vLJTbnU4cJ"},{"type":"text","value":"0","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"tzigZNbkLP"},{"type":"text","value":" otherwise.\nIn this case, the only randomness in sampling trajectories comes from the initial state distribution ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"y8cVLSNvTd"},{"type":"text","value":"μ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"Ib7wEbf1HC"},{"type":"text","value":" and the state transitions ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"pl9B6nA1qy"},{"type":"inlineMath","value":"P","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"PPP","key":"hnWXi94v0g"},{"type":"text","value":".","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"ed1MKnogAt"}],"key":"seDaV2dE9q"}],"key":"WhdiFXwrFQ"},{"type":"block","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"children":[{"type":"text","value":"Value functions","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"key":"pAK9Agspzc"}],"identifier":"value-functions","label":"Value functions","html_id":"value-functions","implicit":true,"enumerator":"1.2.4","key":"U13YbWD2fQ"},{"type":"paragraph","position":{"start":{"line":384,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"The main goal of RL is to find a policy that maximizes the expected total\nreward ","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"UUgHzdis5M"},{"type":"inlineMath","value":"\\E [r_0 + \\cdots + r_{\\hor-1}]","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"html":"E[r0++rH1]\\E [r_0 + \\cdots + r_{\\hor-1}]E[r0++rH1]","key":"VtDJFPYsya"},{"type":"text","value":".","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"VocABv8f78"}],"key":"K5ldjIt346"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"RrgHvRtLSa"}],"key":"JIPq4VA2hC"},{"type":"paragraph","position":{"start":{"line":388,"column":1},"end":{"line":390,"column":1}},"children":[{"type":"text","value":"Note that ","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"hGpABc0K3X"},{"type":"inlineMath","value":"r_0 + \\cdots + r_{\\hor-1}","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"html":"r0++rH1r_0 + \\cdots + r_{\\hor-1}r0++rH1","key":"VIYFAKHkqT"},{"type":"text","value":" is a random variable.\nWhat sources of randomness does it depend on?\nDescribe the generating process.","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"Jyt4fr4aZU"}],"key":"DbzoqaSaX0"}],"key":"L94QQlYTq6"},{"type":"paragraph","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"children":[{"type":"text","value":"Let’s introduce some notation 
for analyzing this quantity.","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"key":"oQZxOW17z2"}],"key":"BIpkIRxPsU"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"A policy’s ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"FuJTiD3wEk"},{"type":"strong","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"OzBHUBUlGL"}],"key":"Fy6KMYXVEz"},{"type":"text","value":" at time ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"BZaMgLft6W"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"h\\hih","key":"CwzfTD1Ker"},{"type":"text","value":" is its expected remaining reward ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"fmnbj1VOFD"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"from a given state","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"lE1O9SecCE"}],"key":"a0QsjhTAPS"},{"type":"text","value":":","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"KQq9FOLbfq"}],"key":"rDuENXXFBd"},{"type":"proof","kind":"definition","label":"value","identifier":"value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Value function","position":{"start":{"line":397,"column":1},"end":{"line":397,"column":1}},"key":"RgSrkEAMgK"}],"key":"rwDYNOYSHg"},{"type":"math","value":"V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"Vhπ(s):=Eτρπ[rh++rH1sh=s]V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]Vhπ(s):=Eτρπ[rh++rH1sh=s]","enumerator":"1.7","key":"xdDt2SGWQ8"}],"enumerator":"1.6","html_id":"value","key":"RktCcZYKLI"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":404,"column":1}},"children":[{"type":"text","value":"Similarly, we can define the ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"rAXj0bk4FG"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"action-value function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"PSVdywCvJN"}],"key":"BayznwMj8j"},{"type":"text","value":" (aka the\n","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"S6AjXfezL5"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Q-function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"OwF66As2l2"}],"key":"q8U6mn3yeh"},{"type":"text","value":") at time ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"igkxnOkFf6"},{"type":"inlineMath","value":"h","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"html":"hhh","key":"FabcrJ9KrG"},{"type":"text","value":" as the expected remaining reward 
","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Sm3VnlncFi"},{"type":"emphasis","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"from a given state and taking a given action","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"GQBiTpZ14i"}],"key":"xYALAQ8umI"},{"type":"text","value":":","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"B7J1oBSd23"}],"key":"maE0dECcZK"},{"type":"proof","kind":"definition","label":"action_value","identifier":"action_value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Action-value function","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"sHBbO5RQlc"}],"key":"wHUSsKpVGc"},{"type":"math","value":"Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"html":"Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]","enumerator":"1.8","key":"NXJPfRFGoA"}],"enumerator":"1.7","html_id":"action-value","key":"G40OiMbwbq"}],"key":"pShUC0B3ck"},{"type":"block","position":{"start":{"line":412,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"heading","depth":4,"position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"Relating the value function and action-value function","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"key":"oThg0GYyoc"}],"identifier":"relating-the-value-function-and-action-value-function","label":"Relating the value function and action-value function","html_id":"relating-the-value-function-and-action-value-function","implicit":true,"enumerator":"1.2.4.1","key":"sdg6gfdXkK"},{"type":"paragraph","position":{"start":{"line":416,"column":1},"end":{"line":417,"column":1}},"children":[{"type":"text","value":"Note that the value function is just the expected action-value over\nactions drawn from the policy:","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"AjFrzCoUf2"}],"key":"kYXvwTlYRj"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]","position":{"start":{"line":419,"column":1},"end":{"line":419,"column":1}},"html":"Vhπ(s)=Eaπh(s)[Qhπ(s,a)]V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]Vhπ(s)=Eaπh(s)[Qhπ(s,a)]","enumerator":"1.9","key":"kyP4cbQKtY"}],"key":"CzftdFNh4Y"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_v(\n policy: Float[Array, \"S A\"],\n q: Float[Array, \"S A\"],\n) -> Float[Array, \" S\"]:\n \"\"\"\n Compute the value function for a given policy in a known finite MDP\n at a single timestep from its action-value function.\n \"\"\"\n return jnp.average(q, weights=policy, axis=1)","key":"FrZA1ILcT2"},{"type":"output","id":"HfkUdII9PepamIJ08fcRU","data":[],"key":"xENzM3j8le"}],"data":{},"key":"lNoXHeKcrt"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":433,"column":1},"end":{"line":434,"column":1}},"children":[{"type":"text","value":"and the action-value is the sum of the immediate reward and the expected value of the 
following\nstate:","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"cubrr5jIiG"}],"key":"J0CGgD7Hwv"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]","enumerator":"1.10","key":"UqX0bBY34B"}],"key":"abQztaHi0R"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def v_to_q(\n mdp: MDP,\n v_next: Float[Array, \" S\"],\n) -> Float[Array, \"S A\"]:\n \"\"\"\n Compute the action-value function in a known finite MDP\n at a single timestep from the corresponding value function.\n \"\"\"\n # the discount factor is relevant later\n return mdp.r + mdp.γ * mdp.P @ v_next\n\n\n# convert a list of v functions to a list of q functions\nv_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0))","key":"LQhf1lZLZe"},{"type":"output","id":"Nu_ULpTAYytwf05gGZ-Au","data":[],"key":"Bg5Dr6N5Fr"}],"data":{},"key":"VmNu0hmWEQ"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"children":[{"type":"text","value":"Greedy policies","position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"key":"wnEy5QtXpO"}],"identifier":"greedy-policies","label":"Greedy policies","html_id":"greedy-policies","implicit":true,"enumerator":"1.2.4.2","key":"MumOokY3u3"},{"type":"paragraph","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"For any given ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"OjfIsd7ldb"},{"type":"inlineMath","value":"Q \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QRS×AQ \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}QRS×A","key":"DHJC00AQWQ"},{"type":"text","value":", we can define the ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"PRtJawRyen"},{"type":"strong","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"greedy policy","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"llzUn6bWrT"}],"key":"pYCHLx84kq"},{"type":"text","value":" ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"stytsA21OX"},{"type":"inlineMath","value":"\\hat \\pi_Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"π^Q\\hat \\pi_Qπ^Q","key":"HWcDyFYJvw"},{"type":"text","value":" as the deterministic policy that selects the action with the highest ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"ChYXmVrfaC"},{"type":"inlineMath","value":"Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QQQ","key":"ntK4UI9byo"},{"type":"text","value":"-value at each state:","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"eN4bdKLYVV"}],"key":"QbPPOTmFuB"},{"type":"math","value":"\\hat \\pi_Q(s) = \\arg\\max_{a} Q_{sa}","position":{"start":{"line":459,"column":1},"end":{"line":461,"column":1}},"html":"π^Q(s)=argmaxaQsa\\hat \\pi_Q(s) = \\arg\\max_{a} 
Q_{sa}π^Q(s)=argamaxQsa","enumerator":"1.11","key":"RyWMBp74fV"}],"key":"Pt6doVcZwP"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_greedy(q: Float[Array, \"S A\"]) -> Float[Array, \"S A\"]:\n \"\"\"\n Get the (deterministic) greedy policy with respect to an action-value function.\n Return the policy as a matrix of shape (S, A) where each row is a one-hot vector.\n \"\"\"\n A = q.shape[1]\n a_ary = jnp.argmax(q, axis=1)\n return jnp.eye(A)[a_ary]\n\n\ndef v_to_greedy(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \"S A\"]:\n \"\"\"Get the (deterministic) greedy policy with respect to a value function.\"\"\"\n return q_to_greedy(v_to_q(mdp, v))","key":"u3pK9tNFsc"},{"type":"output","id":"X-weAlLmbQ2g2iN-y5otu","data":[],"key":"Ge4b0YZXH6"}],"data":{},"key":"c1ArbkLrNW"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"children":[{"type":"text","value":"The one-step (Bellman) consistency equation","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"NSdWORoami"}],"identifier":"the-one-step-bellman-consistency-equation","label":"The one-step (Bellman) consistency equation","html_id":"the-one-step-bellman-consistency-equation","implicit":true,"enumerator":"1.2.5","key":"Hh0wErIFmJ"},{"type":"paragraph","position":{"start":{"line":481,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that by simply considering the cumulative reward as the sum of the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"TfEesLN8QA"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"R6I72Lpaoe"}],"key":"XasqoFyyDp"},{"type":"text","value":" reward and the ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"Hh61Meprv8"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"future","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"B5M0nhmQwr"}],"key":"nCD5CVaEDf"},{"type":"text","value":" cumulative reward, we can describe the\nvalue function recursively (in terms of itself). 
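For instance (our own example), the policy that is greedy with respect to the immediate reward matrix of the tidying MDP ignores in the orderly state and tidies in the messy state, matching `tidy_policy_messy_only` at any single timestep.

```python
# hypothetical usage of q_to_greedy: greedy with respect to the immediate reward r
π_greedy = q_to_greedy(tidy_mdp.r)
assert jnp.allclose(π_greedy, jnp.array([[1.0, 0.0], [0.0, 1.0]]))
```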
This is named the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"RdoPEVnla1"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Bellman consistency equation","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"UGloC6AuXA"}],"key":"tBaKXppDFs"},{"type":"text","value":" after ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"PQrxMKRzMM"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Richard Bellman","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"dCKz1lQjmS"}],"key":"Jt9HVViEJp"},{"type":"text","value":" (1920--1984),\nwho is credited with introducing dynamic programming in 1953.","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"VgQ1twywGh"}],"key":"DqAZkrAAeK"},{"type":"proof","kind":"theorem","label":"bellman_consistency","identifier":"bellman_consistency","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for the value function","position":{"start":{"line":487,"column":1},"end":{"line":487,"column":1}},"key":"t4j9kAsr7M"}],"key":"yDP1g4dqbM"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":490,"column":1},"end":{"line":492,"column":1}},"html":"Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]","enumerator":"1.12","key":"M911ptkvJr"}],"enumerator":"1.1","html_id":"bellman-consistency","key":"aj7ygkU2it"}],"key":"eqzj1slPRW"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def check_bellman_consistency_v(\n mdp: MDP,\n policy: Float[Array, \"H S A\"],\n v_ary: Float[Array, \"H S\"],\n) -> bool:\n \"\"\"\n Check that the given (time-dependent) \"value function\"\n satisfies the Bellman consistency equation.\n \"\"\"\n return all(\n jnp.allclose(\n # lhs\n v_ary[h],\n # rhs\n jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1),\n )\n for h in range(mdp.H - 1)\n )","key":"yiZT7k9BLu"},{"type":"output","id":"_Ex0Fz7xaMyUjsNxz5bLL","data":[],"key":"EPXqzdN51j"}],"data":{},"key":"jwDYd6HDTf"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"PgQjGgqbt5"}],"key":"Ax3snZKQVg"},{"type":"paragraph","position":{"start":{"line":517,"column":1},"end":{"line":518,"column":1}},"children":[{"type":"text","value":"Verify that this equation holds by expanding ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"zoGYRrGyEe"},{"type":"inlineMath","value":"V_\\hi^\\pi(s)","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vhπ(s)V_\\hi^\\pi(s)Vhπ(s)","key":"iUwzn5k8hW"},{"type":"text","value":"\nand 
","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"kMUy3aPCxV"},{"type":"inlineMath","value":"V_{\\hi+1}^\\pi(s')","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vh+1π(s)V_{\\hi+1}^\\pi(s')Vh+1π(s)","key":"Pcgh6U649U"},{"type":"text","value":".","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"b36gwJgJLx"}],"key":"XyVOn26v9m"}],"key":"vXzi7EOjc8"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":522,"column":1}},"children":[{"type":"text","value":"One can analogously derive the Bellman consistency equation for the\naction-value function:","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"JHa1ki3idm"}],"key":"TcTO3QvuTy"},{"type":"proof","kind":"theorem","label":"bellman_consistency_action","identifier":"bellman_consistency_action","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for action-values","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"vbBdWDdLhe"}],"key":"UDqWlOQOdY"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]","enumerator":"1.13","key":"oWuBHoRibN"}],"enumerator":"1.2","html_id":"bellman-consistency-action","key":"Ik4oZRiRI8"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"lxN4LVxFo9"}],"key":"sBl3Y4lU33"},{"type":"paragraph","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"children":[{"type":"text","value":"Write a ","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"tXB7uOjKGz"},{"type":"inlineCode","value":"check_bellman_consistency_q","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"kiQiHevkLq"},{"type":"text","value":" function for the action-value function.","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"qjYFyWj5bs"}],"key":"Vd7kg7s49a"}],"key":"rHNG3cioJF"},{"type":"proof","kind":"remark","label":"bellman_det","identifier":"bellman_det","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman consistency equation for deterministic policies","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"oSwj0d23uZ"}],"key":"epOuWXL8MD"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Note that for deterministic policies, the Bellman consistency equation\nsimplifies to","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"ZGroFKhdPZ"}],"key":"YbHk9xSN6Y"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', 
\\pi_{\\hi+1}(s'))]\n\\end{aligned}","position":{"start":{"line":540,"column":1},"end":{"line":545,"column":1}},"html":"Vhπ(s)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]Qhπ(s,a)=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', \\pi_{\\hi+1}(s'))]\n\\end{aligned}Vhπ(s)Qhπ(s,a)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]","enumerator":"1.14","key":"onIEWT2oro"}],"enumerator":"1.1","html_id":"bellman-det","key":"G9kMrDFz0S"}],"key":"fySVwxlBaI"},{"type":"block","position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"The one-step Bellman operator","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"Ws4d30HsgB"}],"identifier":"the-one-step-bellman-operator","label":"The one-step Bellman operator","html_id":"the-one-step-bellman-operator","implicit":true,"enumerator":"1.2.6","key":"tQqA1FReM1"},{"type":"paragraph","position":{"start":{"line":552,"column":1},"end":{"line":554,"column":1}},"children":[{"type":"text","value":"Fix a policy ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"ood3jrhhiQ"},{"type":"text","value":"π","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"csVRYKa1rC"},{"type":"text","value":". Consider the higher-order operator that takes in a\n“value function” ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"wvQTfvWzwY"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"mcCc8qtVRE"},{"type":"text","value":" and returns the r.h.s. 
of the Bellman\nequation for that “value function”:","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"Ds4akFGb0G"}],"key":"ZlMeeC1WKh"},{"type":"proof","kind":"definition","label":"bellman_operator","identifier":"bellman_operator","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":556,"column":1},"end":{"line":556,"column":1}},"key":"VKZ7Ke7eyD"}],"key":"o9ESmfjlbq"},{"type":"math","value":"[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].","position":{"start":{"line":559,"column":1},"end":{"line":559,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].","enumerator":"1.15","key":"geGGoDGniH"},{"type":"paragraph","position":{"start":{"line":561,"column":1},"end":{"line":564,"column":1}},"children":[{"type":"text","value":"This is a crucial tool for reasoning about MDPs.\nIntuitively, it answers the following question:\nif we evaluate the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"T58Qho21zX"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"next","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"C5RyFqd31i"}],"key":"ZarRlTtChM"},{"type":"text","value":" state using ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"peTgZoppEA"},{"type":"inlineMath","value":"v","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"html":"vvv","key":"ZYEsuZENZF"},{"type":"text","value":",\nhow good is the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"YFrXaBLhC3"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"Z0DJaPVt5Q"}],"key":"tnma44jZHB"},{"type":"text","value":" state, according to the given policy?","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"TzLupPAQsb"}],"key":"XJV12lAnDj"}],"enumerator":"1.8","html_id":"bellman-operator","key":"QkfB9bUfAY"}],"key":"q0UTyXvzu1"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator_looping(\n    mdp: MDP,\n    policy: Float[Array, \"S A\"],\n    v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n    \"\"\"\n    Looping definition of the Bellman operator.\n    A concise version is below.\n    \"\"\"\n    v_new = jnp.zeros(mdp.S)\n    for s in range(mdp.S):\n        for a in range(mdp.A):\n            for s_next in range(mdp.S):\n                # Jax arrays are immutable, so accumulate via .at[s].add(...)\n                v_new = v_new.at[s].add(\n                    policy[s, a]\n                    * mdp.P[s, a, s_next]\n                    * (mdp.r[s, a] + mdp.γ * v[s_next])\n                )\n    return v_new","visibility":"hide","key":"rrVMX9XdvN"},{"type":"output","id":"NJNWRcvX3cQdCUn9DueC9","data":[],"visibility":"show","key":"NItgt6q4mK"}],"data":{"tags":[]},"visibility":"show","key":"ilNaWeBpfa"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Note that we can concisely implement this using the 
","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"PoqX4P8AMX"},{"type":"inlineCode","value":"q_to_v","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"nxu1L4RPGa"},{"type":"text","value":" and ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"NUw95fjMrP"},{"type":"inlineCode","value":"v_to_q","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"ZpC6TdHMjZ"},{"type":"text","value":" utilities from above:","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"V7oRjcTyrO"}],"key":"ZUeEGiU6zP"}],"key":"a9bIFGNd6s"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator(\n mdp: MDP,\n policy: Float[Array, \"S A\"],\n v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n \"\"\"For a known finite MDP, the Bellman operator can be exactly evaluated.\"\"\"\n return q_to_v(policy, v_to_q(mdp, v)) # equivalent\n return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1)","key":"wPYD9wWfF0"},{"type":"output","id":"HpY8nXQf7aL8_8e7N5xDc","data":[],"key":"TTxNZjerwg"}],"data":{},"key":"tH4YWBFkwn"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":604,"column":1},"end":{"line":608,"column":1}},"children":[{"type":"text","value":"We’ll call ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"XSaITtyc6n"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"Jπ:RSRS\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}Jπ:RSRS","key":"WfTdyZUwQJ"},{"type":"text","value":" the ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"CmH8VY4YjV"},{"type":"strong","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"children":[{"type":"text","value":"Bellman\noperator","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"gC32RHZURv"}],"key":"Gcw4cl4Nu6"},{"type":"text","value":" of ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"RNrc1ORjhX"},{"type":"text","value":"π","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"JC1OhLKCLP"},{"type":"text","value":".\nNote that it’s defined on any “value function” mapping states to real numbers;\n","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"DngY5Eumt2"},{"type":"inlineMath","value":"v","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"vvv","key":"CuoCXYz79d"},{"type":"text","value":" doesn’t have to be a well-defined value function for some policy (hence the lowercase notation).\nThe Bellman operator also gives us a concise way to express ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"VaKEmUDBqw"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"QgrwypmFvI"},{"type":"text","value":"1.1","key":"W6IYzC6gfx"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"TrStES1s77"},{"type":"text","value":" for the value 
function:","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"ds2ioGQbaq"}],"key":"xr45nIdlWb"},{"type":"math","value":"V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)","position":{"start":{"line":610,"column":1},"end":{"line":610,"column":1}},"html":"Vhπ=Jπ(Vh+1π)V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)Vhπ=Jπ(Vh+1π)","enumerator":"1.16","key":"hGhLr8gloW"},{"type":"paragraph","position":{"start":{"line":612,"column":1},"end":{"line":615,"column":1}},"children":[{"type":"text","value":"Intuitively, the output of the Bellman operator, a new “value function”,\nevaluates states as follows: from a given state, take one action\naccording to ","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"Cw8uXWvJ8h"},{"type":"text","value":"π","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"QOOpR9TStu"},{"type":"text","value":", observe the reward, and then evaluate the next state\nusing the input “value function”.","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"JuEqVU4u5l"}],"key":"sK4bMMz9Z1"},{"type":"paragraph","position":{"start":{"line":617,"column":1},"end":{"line":619,"column":1}},"children":[{"type":"text","value":"When we discuss infinite-horizon MDPs, the Bellman operator will turn\nout to be more than just a notational convenience: We’ll use it to\nconstruct algorithms for computing the optimal policy.","position":{"start":{"line":617,"column":1},"end":{"line":617,"column":1}},"key":"IQEcGqrROY"}],"key":"TtXblaGdme"},{"type":"heading","depth":2,"position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"Solving finite-horizon MDPs","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"tYyqU6TAyt"}],"label":"finite_horizon_mdps","identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","enumerator":"1.3","key":"Oz1TtIoRpC"},{"type":"heading","depth":3,"position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"children":[{"type":"text","value":"Policy evaluation in finite-horizon MDPs","position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"key":"F73hfHtb6g"}],"label":"eval_dp","identifier":"eval_dp","html_id":"eval-dp","enumerator":"1.3.1","key":"WnUcB2jJOW"},{"type":"paragraph","position":{"start":{"line":628,"column":1},"end":{"line":629,"column":1}},"children":[{"type":"text","value":"How can we actually compute the value function of a given policy? 
This\nis the task of ","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"jnpThwandg"},{"type":"strong","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"akF4fkFUB7"}],"key":"BibJv7OTmi"},{"type":"text","value":".","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"ppdzIMnl4n"}],"key":"FtOY5rz5DM"},{"type":"proof","kind":"definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"DP algorithm to evaluate a policy in a finite-horizon MDP","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"key":"o9pkBo4cMZ"}],"key":"okDFCHfZ5M"},{"type":"paragraph","position":{"start":{"line":633,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation\n","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"usXyg4WH1K"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"gVxETJUuFh"},{"type":"text","value":"1.1","key":"VL8h8mzONk"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"VXJ9V6Fpen"},{"type":"text","value":"\ngives us a convenient algorithm for\nevaluating policies: it expresses the value function at\ntimestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"JmOoCk4c34"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h\\hih","key":"V5EVPU8WWC"},{"type":"text","value":" as a function of the value function at timestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"RbWfZXuaQ5"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h+1\\hi+1h+1","key":"FoMCpoOUPu"},{"type":"text","value":". 
This\nmeans we can start at the end of the time horizon, where the value is\nknown, and work backwards in time, using the Bellman consistency\nequation to compute the value function at each time step.","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"rO9xENZ9ES"}],"key":"lVIAYyza71"}],"enumerator":"1.9","key":"cMcpMHcPyW"}],"key":"ZWzWXUZf2W"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def dp_eval_finite(mdp: MDP, policy: Float[Array, \"H S A\"]) -> Float[Array, \"H S\"]:\n    \"\"\"Evaluate a (time-dependent) policy using dynamic programming.\"\"\"\n    V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)]  # initialize to 0 at end of time horizon\n    for h in range(mdp.H - 1, -1, -1):\n        V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1])\n    return jnp.stack(V_ary[:-1])","key":"ruK684AGXA"},{"type":"output","id":"iSeUWUW5q8mGwwfbUB12b","data":[],"key":"kQSoUPjJ8T"}],"data":{},"key":"Y7U6IFhFdC"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":652,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"This runs in time ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"XvoeiXTVIJ"},{"type":"inlineMath","value":"O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"O(HS2A)O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)O(HS2A)","key":"DG6jDtEWTf"},{"type":"text","value":" by counting the\nloops.","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"yxMbVn1sFr"}],"key":"YTmZXXLjiE"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"c1ThCcPyn2"}],"key":"Ex5sJLJhPf"},{"type":"paragraph","position":{"start":{"line":656,"column":1},"end":{"line":657,"column":1}},"children":[{"type":"text","value":"Do you see where we compute ","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"qCIJHOjTOZ"},{"type":"inlineMath","value":"Q^\\pi_\\hi","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"html":"QhπQ^\\pi_\\hiQhπ","key":"JqgjqMLbaD"},{"type":"text","value":" along the way? Make\nthis step explicit.","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"MaqHxhh2TR"}],"key":"E437ICteHV"}],"key":"tzf0ZZECA8"},
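{"type":"paragraph","children":[{"type":"text","value":"One possible way to make this step explicit (a sketch reusing the q_to_v and v_to_q helpers from above; the function name dp_eval_finite_explicit_q is ours, not from the original notebook):","key":"qXplTxtAa01"}],"key":"qXplParAa02"},{"type":"code","lang":"python","value":"def dp_eval_finite_explicit_q(\n    mdp: MDP, policy: Float[Array, \"H S A\"]\n) -> tuple[Float[Array, \"H S\"], Float[Array, \"H S A\"]]:\n    \"\"\"Policy evaluation that also records Q^pi_h at each step (a sketch).\"\"\"\n    V = [None] * mdp.H + [jnp.zeros(mdp.S)]\n    Q = [None] * mdp.H\n    for h in range(mdp.H - 1, -1, -1):\n        Q[h] = v_to_q(mdp, V[h + 1])  # Q_h^pi(s, a) = r(s, a) + γ E[V_{h+1}^pi(s')]\n        V[h] = q_to_v(policy[h], Q[h])  # average Q_h^pi over the policy's actions\n    return jnp.stack(V[:-1]), jnp.stack(Q)","key":"qXplCodAa03"},{"type":"proof","kind":"example","label":"tidy_eval_finite","identifier":"tidy_eval_finite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":660,"column":1},"end":{"line":660,"column":1}},"key":"pGs2Y4T95z"}],"key":"nBw117Lixa"},{"type":"paragraph","position":{"start":{"line":663,"column":1},"end":{"line":667,"column":1}},"children":[{"type":"text","value":"Let’s evaluate the policy from\n","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"smU1xd5MnI"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"Xk9GH1PXFc"},{"type":"text","value":"1.2","key":"iDo2pZG67n"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"FdWzuLd9e7"},{"type":"text","value":" in the tidying MDP\nthat tidies if and only if the room is\nmessy. 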
We’ll use the Bellman consistency equation to compute the value\nfunction at each time step.","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"ODgNx5nGF8"}],"key":"YZ4PQVcVJt"},{"type":"math","value":"\\begin{aligned}\nV_{H-1}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) \\\\\n&= 1 \\\\\nV_{H-1}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) \\\\\n&= 0 \\\\\nV_{H-2}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-1}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1 + 0.3 \\cdot 0 \\\\\n&= 1.7 \\\\\nV_{H-2}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-1}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 \\\\\nV_{H-3}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-2}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1.7 + 0.3 \\cdot 1 \\\\\n&= 2.49 \\\\\nV_{H-3}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-2}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1.7\n\\end{aligned}","position":{"start":{"line":669,"column":1},"end":{"line":690,"column":1}},"html":"VH1π(orderly)=r(orderly,ignore)=1VH1π(messy)=r(messy,tidy)=0VH2π(orderly)=r(orderly,ignore)+EsP(orderly,ignore)[VH1π(s)]=1+0.7VH1π(orderly)+0.3VH1π(messy)=1+0.71+0.30=1.7VH2π(messy)=r(messy,tidy)+EsP(messy,tidy)[VH1π(s)]=0+1VH1π(orderly)+0VH1π(messy)=1VH3π(orderly)=r(orderly,ignore)+EsP(orderly,ignore)[VH2π(s)]=1+0.7VH2π(orderly)+0.3VH2π(messy)=1+0.71.7+0.31=2.49VH3π(messy)=r(messy,tidy)+EsP(messy,tidy)[VH2π(s)]=0+1VH2π(orderly)+0VH2π(messy)=1.7\\begin{aligned}\nV_{H-1}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) \\\\\n&= 1 \\\\\nV_{H-1}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) \\\\\n&= 0 \\\\\nV_{H-2}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-1}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1 + 0.3 \\cdot 0 \\\\\n&= 1.7 \\\\\nV_{H-2}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-1}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 \\\\\nV_{H-3}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-2}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1.7 + 0.3 \\cdot 1 \\\\\n&= 2.49 \\\\\nV_{H-3}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-2}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 
1.7\n\\end{aligned}VH1π(orderly)VH1π(messy)VH2π(orderly)VH2π(messy)VH3π(orderly)VH3π(messy)=r(orderly,ignore)=1=r(messy,tidy)=0=r(orderly,ignore)+EsP(orderly,ignore)[VH1π(s)]=1+0.7VH1π(orderly)+0.3VH1π(messy)=1+0.71+0.30=1.7=r(messy,tidy)+EsP(messy,tidy)[VH1π(s)]=0+1VH1π(orderly)+0VH1π(messy)=1=r(orderly,ignore)+EsP(orderly,ignore)[VH2π(s)]=1+0.7VH2π(orderly)+0.3VH2π(messy)=1+0.71.7+0.31=2.49=r(messy,tidy)+EsP(messy,tidy)[VH2π(s)]=0+1VH2π(orderly)+0VH2π(messy)=1.7","enumerator":"1.17","key":"H2F39hTIaX"},{"type":"paragraph","position":{"start":{"line":692,"column":1},"end":{"line":693,"column":1}},"children":[{"type":"text","value":"etc. You may wish to repeat this computation for the\nother policies to get a better sense of this algorithm.","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"ZbezvsESUD"}],"key":"plxsaYIhXZ"}],"enumerator":"1.4","html_id":"tidy-eval-finite","key":"QMqBHJoynY"}],"key":"imkJMhzbIE"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"V_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only)\nV_messy","key":"MnNT8MWLuo"},{"type":"output","id":"Mn3wzcTmz6v2dDOdk6riz","data":[{"output_type":"execute_result","execution_count":14,"metadata":{},"data":{"text/plain":{"content":"Array([[5.5621696, 4.7927704],\n [4.7927704, 4.0241003],\n [4.0241003, 3.253 ],\n [3.253 , 2.49 ],\n [2.49 , 1.7 ],\n [1.7 , 1. ],\n [1. , 0. ]], dtype=float32)","content_type":"text/plain"}}}],"key":"b2Mg3FxOd6"}],"data":{},"key":"dftOvrgaSA"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"children":[{"type":"text","value":"Optimal policies in finite-horizon MDPs","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"OImdOoVz7c"}],"label":"opt_dynamic_programming","identifier":"opt_dynamic_programming","html_id":"opt-dynamic-programming","enumerator":"1.3.2","key":"d34MV3C4rg"},{"type":"paragraph","position":{"start":{"line":704,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"We’ve just seen how to ","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"OSybg8P10D"},{"type":"emphasis","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"DFFX7oIC2D"}],"key":"kvP8AXIuXa"},{"type":"text","value":" a given policy. 
But how can we find\nthe ","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"cXiunsKu53"},{"type":"strong","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"C5CgXsVVE3"}],"key":"b42uH42rQs"},{"type":"text","value":" for a given environment?","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"nvm1rQXGqQ"}],"key":"X7HIFhZHnL"},{"type":"proof","kind":"definition","label":"optimal_policy_finite","identifier":"optimal_policy_finite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal policies","position":{"start":{"line":707,"column":1},"end":{"line":707,"column":1}},"key":"xBSmopuV2X"}],"key":"RT6lT6YcCW"},{"type":"paragraph","position":{"start":{"line":710,"column":1},"end":{"line":712,"column":1}},"children":[{"type":"text","value":"We call a policy optimal, and denote it by ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"kwJeeajz12"},{"type":"inlineMath","value":"\\pi^\\star","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"html":"π\\pi^\\starπ","key":"fplKrLmANe"},{"type":"text","value":", if it does at\nleast as well as ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"h6Ir0f66HZ"},{"type":"emphasis","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"sqDGjBjdDa"}],"key":"bt4b0LD2J2"},{"type":"text","value":" other policy ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"WrRFAJgQtY"},{"type":"text","value":"π","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"Cx3konEde6"},{"type":"text","value":" (including stochastic and\nhistory-dependent ones) in all situations:","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"Ygzy2fm5Vw"}],"key":"BwGqWrhvMw"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\cdots + r_{H-1} \\mid \\tau_\\hi] \\quad \\forall \\pi, \\tau_\\hi, \\hi \\in [H]\n\\end{aligned}","position":{"start":{"line":714,"column":1},"end":{"line":719,"column":1}},"html":"Vhπ(s)=Eτρπ[rh++rH1sh=s]Eτρπ[rh++rH1τh]π,τh,h[H]\\begin{aligned}\n V_\\hi^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\cdots + r_{H-1} \\mid \\tau_\\hi] \\quad \\forall \\pi, \\tau_\\hi, \\hi \\in [H]\n\\end{aligned}Vhπ(s)=Eτρπ[rh++rH1sh=s]Eτρπ[rh++rH1τh]π,τh,h[H]","enumerator":"1.18","key":"Qd49TCNBM3"},{"type":"paragraph","position":{"start":{"line":721,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"where we condition on the\ntrajectory up to time ","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"vEgDZATgec"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"h\\hih","key":"MEKuxOLoyB"},{"type":"text","value":", 
denoted\n","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"TjOqMvUZ1C"},{"type":"inlineMath","value":"\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"τh=(s0,a0,r0,,sh)\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)τh=(s0,a0,r0,,sh)","key":"nYM5unsro9"},{"type":"text","value":", where ","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"nH3C6bC9xy"},{"type":"inlineMath","value":"s_\\hi = s","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"sh=ss_\\hi = ssh=s","key":"ouF2Ud7y4h"},{"type":"text","value":".","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"Qo7nFWlqKx"}],"key":"VyR5cnExnl"}],"enumerator":"1.10","html_id":"optimal-policy-finite","key":"vwDQQjH8SR"},{"type":"paragraph","position":{"start":{"line":726,"column":1},"end":{"line":729,"column":1}},"children":[{"type":"text","value":"Convince yourself that all optimal policies must have the same value\nfunction. We call this the ","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"KnPymXBrnV"},{"type":"strong","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"TSyWPiziAk"}],"key":"xfYD1tYlGH"},{"type":"text","value":" and denote it by\n","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"WZF9mOFqZI"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"X8Yae7sTQS"},{"type":"text","value":". 
The same goes for the action-value function\n","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"S7ZzKsDKCL"},{"type":"inlineMath","value":"Q_\\hi^\\star(s, a)","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"html":"Qh(s,a)Q_\\hi^\\star(s, a)Qh(s,a)","key":"YsLTmdXJuJ"},{"type":"text","value":".","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"AdzvFQ5XHB"}],"key":"hO3ZJoJa9F"},{"type":"paragraph","position":{"start":{"line":731,"column":1},"end":{"line":734,"column":1}},"children":[{"type":"text","value":"It is a stunning fact that ","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"Ft74FVlfUi"},{"type":"strong","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"children":[{"type":"text","value":"every finite-horizon MDP has an optimal\npolicy that is time-dependent and deterministic.","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"qfaGv6ts5e"}],"key":"HZUSlyNKEw"},{"type":"text","value":" In particular, we can\nconstruct such a policy by acting ","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"eC05hgckja"},{"type":"emphasis","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"children":[{"type":"text","value":"greedily","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"VSy4C1Na1v"}],"key":"RcBtkpfXC2"},{"type":"text","value":" with respect to the optimal\naction-value function:","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"vetEP51oY2"}],"key":"bU0SnGfiF5"},{"type":"proof","kind":"theorem","label":"optimal_greedy","identifier":"optimal_greedy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"It is optimal to be greedy with respect to the optimal value function","position":{"start":{"line":737,"column":1},"end":{"line":737,"column":1}},"key":"Ja5rZPdZdf"}],"key":"PgXoqhbJIp"},{"type":"math","value":"\\pi_\\hi^\\star(s) = \\arg\\max_a Q_\\hi^\\star(s, a).","position":{"start":{"line":740,"column":1},"end":{"line":740,"column":1}},"html":"πh(s)=argmaxaQh(s,a).\\pi_\\hi^\\star(s) = \\arg\\max_a Q_\\hi^\\star(s, a).πh(s)=argamaxQh(s,a).","enumerator":"1.19","key":"IiBv8hoo53"}],"enumerator":"1.3","html_id":"optimal-greedy","key":"eZxwx0KiJ2"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"vHcp9yeNI5"}],"key":"Fn6n5MKyDx"},{"type":"paragraph","position":{"start":{"line":744,"column":1},"end":{"line":745,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"oVTw9ACTJM"},{"type":"inlineMath","value":"V^{\\star}","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"html":"VV^{\\star}V","key":"QFgR3FYVHH"},{"type":"text","value":" and ","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"F1p9Oe17J4"},{"type":"inlineMath","value":"Q^{\\star}","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"html":"QQ^{\\star}Q","key":"q9T6PYFDB8"},{"type":"text","value":" denote the optimal value and\naction-value functions. 
Consider the greedy policy","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"F3HaH2LK1u"}],"key":"QL9620iaQt"},{"type":"math","value":"\\hat \\pi_\\hi(s) := \\arg\\max_a Q_\\hi^{\\star}(s, a).","position":{"start":{"line":747,"column":1},"end":{"line":747,"column":1}},"html":"π^h(s):=argmaxaQh(s,a).\\hat \\pi_\\hi(s) := \\arg\\max_a Q_\\hi^{\\star}(s, a).π^h(s):=argamaxQh(s,a).","enumerator":"1.20","key":"eMw3y4WTRU"},{"type":"paragraph","position":{"start":{"line":749,"column":1},"end":{"line":750,"column":1}},"children":[{"type":"text","value":"We aim to show that\n","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"CNfRMOyHVG"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"html":"π^\\hat \\piπ^","key":"u3fv5cBXbU"},{"type":"text","value":" is optimal; that is, ","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"SAY7PV9qFq"},{"type":"inlineMath","value":"V^{\\hat \\pi} = V^{\\star}","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"html":"Vπ^=VV^{\\hat \\pi} = V^{\\star}Vπ^=V","key":"wTSBbgdTvU"},{"type":"text","value":".","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"EMsaVey4q7"}],"key":"qAaYXA7a5T"},{"type":"paragraph","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"children":[{"type":"text","value":"Fix an arbitrary state ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"bf7jUpV0VB"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"cP5ghRqoFa"},{"type":"text","value":" and time ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"bz7PmLWm3E"},{"type":"inlineMath","value":"\\hi \\in [H]","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"h[H]\\hi \\in [H]h[H]","key":"U9AfDaWNAX"},{"type":"text","value":".","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"LwQt03vHr2"}],"key":"TR2X2FVeCA"},{"type":"paragraph","position":{"start":{"line":754,"column":1},"end":{"line":759,"column":1}},"children":[{"type":"text","value":"Firstly, by the definition of ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"GKHanetaYi"},{"type":"inlineMath","value":"V^{\\star}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"VV^{\\star}V","key":"IT4VDRpFAV"},{"type":"text","value":", we already know\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"DlwFQ1ZwGG"},{"type":"inlineMath","value":"V_\\hi^{\\star}(s) \\ge V_\\hi^{\\hat \\pi}(s)","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Vh(s)Vhπ^(s)V_\\hi^{\\star}(s) \\ge V_\\hi^{\\hat \\pi}(s)Vh(s)Vhπ^(s)","key":"F3CEAygt7M"},{"type":"text","value":". So for equality to hold we just\nneed to show that ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"uThi9if9OZ"},{"type":"inlineMath","value":"V_\\hi^{\\star}(s) \\le V_\\hi^{\\hat \\pi}(s)","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Vh(s)Vhπ^(s)V_\\hi^{\\star}(s) \\le V_\\hi^{\\hat \\pi}(s)Vh(s)Vhπ^(s)","key":"J1hWW6XivG"},{"type":"text","value":". 
We’ll first\nshow that the Bellman operator ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"pkjeMSOzvI"},{"type":"inlineMath","value":"\\mathcal{J}^{\\hat \\pi}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Jπ^\\mathcal{J}^{\\hat \\pi}Jπ^","key":"hPSlaujeKN"},{"type":"text","value":" never decreases\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"mFDlgeUhRv"},{"type":"inlineMath","value":"V_\\hi^{\\star}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"VhV_\\hi^{\\star}Vh","key":"stJDAqoqip"},{"type":"text","value":". Then we’ll apply this result recursively to show that\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"JnjN4tu3ro"},{"type":"inlineMath","value":"V^{\\star} = V^{\\hat \\pi}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"V=Vπ^V^{\\star} = V^{\\hat \\pi}V=Vπ^","key":"UcIvCvFB7F"},{"type":"text","value":".","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"VoUMYPSL3G"}],"key":"EXXe9CSXwu"},{"type":"proof","kind":"lemma","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman operator never decreases the optimal value function","position":{"start":{"line":761,"column":1},"end":{"line":761,"column":1}},"key":"QB2VLgi9mb"}],"key":"rI7oiqKSNW"},{"type":"paragraph","position":{"start":{"line":762,"column":1},"end":{"line":763,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{J}^{\\hat \\pi}","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"html":"Jπ^\\mathcal{J}^{\\hat \\pi}Jπ^","key":"XQt94w5Zbn"},{"type":"text","value":" never decreases ","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"P87c77hcVT"},{"type":"inlineMath","value":"V_\\hi^{\\star}","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"html":"VhV_\\hi^{\\star}Vh","key":"Hr9SbPGvwN"},{"type":"text","value":"\n(elementwise):","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"SjD8a1nuCB"}],"key":"EXOimJaw4Y"},{"type":"math","value":"[\\mathcal{J}^{\\hat \\pi} (V_{\\hi+1}^{\\star})](s) \\ge V_\\hi^{\\star}(s).","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"[Jπ^(Vh+1)](s)Vh(s).[\\mathcal{J}^{\\hat \\pi} (V_{\\hi+1}^{\\star})](s) \\ge V_\\hi^{\\star}(s).[Jπ^(Vh+1)](s)Vh(s).","enumerator":"1.21","key":"MYvwpo6ECS"},{"type":"paragraph","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"strong","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"text","value":"Proof:","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"key":"Kb3kvMENz0"}],"key":"wYyiwGVEUQ"}],"key":"gFgtKKo98d"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^{\\star}(s) &= \\max_{\\pi \\in \\Pi} V_\\hi^{\\pi}(s) \\\\\n &= \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^\\pi(s') \\right] && \\text{Bellman consistency} \\\\\n &\\le \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{definition of } V^\\star \\\\\n &= \\max_{a} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} 
V_{\\hi+1}^{\\star}(s') \\right] && \\text{only depends on } \\pi \\text{ via } a \\\\\n &= [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s). \n\\end{aligned}","position":{"start":{"line":769,"column":1},"end":{"line":777,"column":1}},"html":"Vh(s)=maxπΠVhπ(s)=maxπΠEaπ()[r(s,a)+EsP(s,a)Vh+1π(s)]Bellman consistencymaxπΠEaπ()[r(s,a)+EsP(s,a)Vh+1(s)]definition of V=maxa[r(s,a)+EsP(s,a)Vh+1(s)]only depends on π via a=[Jπ^(Vh+1)](s).\\begin{aligned}\n V_\\hi^{\\star}(s) &= \\max_{\\pi \\in \\Pi} V_\\hi^{\\pi}(s) \\\\\n &= \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^\\pi(s') \\right] && \\text{Bellman consistency} \\\\\n &\\le \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{definition of } V^\\star \\\\\n &= \\max_{a} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{only depends on } \\pi \\text{ via } a \\\\\n &= [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s). \n\\end{aligned}Vh(s)=πΠmaxVhπ(s)=πΠmaxEaπ()[r(s,a)+EsP(s,a)Vh+1π(s)]πΠmaxEaπ()[r(s,a)+EsP(s,a)Vh+1(s)]=amax[r(s,a)+EsP(s,a)Vh+1(s)]=[Jπ^(Vh+1)](s).Bellman consistencydefinition of Vonly depends on π via a","enumerator":"1.22","key":"Qo4ZYsXg4s"},{"type":"paragraph","position":{"start":{"line":779,"column":1},"end":{"line":781,"column":1}},"children":[{"type":"text","value":"Note that the chosen action ","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"key":"hGSYCSifhf"},{"type":"inlineMath","value":"a \\sim \\pi(\\dots)","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"html":"aπ()a \\sim \\pi(\\dots)aπ()","key":"msyReBInsW"},{"type":"text","value":" above\nmight depend on the past history; this isn’t shown in the notation and\ndoesn’t affect our result (make sure you see why).","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"key":"AYF54lng7f"}],"key":"DWCVDkd2PG"}],"enumerator":"1.1","key":"CU11VFuZBf"},{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"We can now apply this result recursively to get","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"MH34NW4FAK"}],"key":"SN5ePHV5PZ"},{"type":"math","value":"V^{\\star}_\\hi(s) \\le V^{\\hat \\pi}_\\hi(s)","position":{"start":{"line":786,"column":1},"end":{"line":786,"column":1}},"html":"Vh(s)Vhπ^(s)V^{\\star}_\\hi(s) \\le V^{\\hat \\pi}_\\hi(s)Vh(s)Vhπ^(s)","enumerator":"1.23","key":"a2GPks4kPP"},{"type":"paragraph","position":{"start":{"line":788,"column":1},"end":{"line":790,"column":1}},"children":[{"type":"text","value":"as follows. (Note that even\nthough ","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"xMVMJx70l3"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"π^\\hat \\piπ^","key":"dHBASosY6c"},{"type":"text","value":" is deterministic, we’ll use the ","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"kTDP4aNYjF"},{"type":"inlineMath","value":"a \\sim \\hat \\pi(s)","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"aπ^(s)a \\sim \\hat \\pi(s)aπ^(s)","key":"MnoeWB43lC"},{"type":"text","value":"\nnotation to make it explicit that we’re sampling a trajectory from it.)","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"RVkAvEA7tX"}],"key":"MKlKJFX4Ez"},{"type":"math","value":"\\begin{aligned}\n V_{\\hi}^{\\star}(s) &\\le [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s) \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue} V_{\\hi+1}^{\\star}(s')} \\right] \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue}[ \\mathcal{J}^{\\hat \\pi} (V_{\\hi+2}^{\\star})] (s')} \\right] \\right] && \\text{above lemma} \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}{\\color{blue} \\left[ \\mathop{\\mathbb{E}}_{a' \\sim \\hat \\pi} r(s', a') + \\mathop{\\mathbb{E}}_{s''} V_{\\hi+2}^{\\star}(s'') \\right]} \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\cdots && \\text{apply at all timesteps} \\\\\n &= \\mathop{\\mathbb{E}}_{\\tau \\sim \\rho^{\\hat \\pi}} [G_{\\hi} \\mid s_\\hi = s] && \\text{rewrite expectation} \\\\\n &= V_{\\hi}^{\\hat \\pi}(s) && \\text{definition}\n\\end{aligned}","position":{"start":{"line":792,"column":1},"end":{"line":802,"column":1}},"html":"Vh(s)≤[Jπ^(Vh+1)](s)=Eaπ^(s)[r(s,a)+EsP(s,a)[Vh+1(s)]]definition of Jπ^Eaπ^(s)[r(s,a)+EsP(s,a)[[Jπ^(Vh+2)](s)]]above lemma=Eaπ^(s)[r(s,a)+EsP(s,a)[Eaπ^r(s,a)+EsVh+2(s)]]definition of Jπ^apply at all timesteps=Eτρπ^[Ghsh=s]rewrite expectation=Vhπ^(s)definition\\begin{aligned}\n V_{\\hi}^{\\star}(s) &\\le [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s) \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue} V_{\\hi+1}^{\\star}(s')} \\right] \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue}[ \\mathcal{J}^{\\hat \\pi} (V_{\\hi+2}^{\\star})] (s')} \\right] \\right] && \\text{above lemma} \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}{\\color{blue} \\left[ \\mathop{\\mathbb{E}}_{a' \\sim \\hat \\pi} r(s', a') + \\mathop{\\mathbb{E}}_{s''} V_{\\hi+2}^{\\star}(s'') \\right]} \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\cdots && \\text{apply at all timesteps} \\\\\n &= \\mathop{\\mathbb{E}}_{\\tau \\sim \\rho^{\\hat \\pi}} [G_{\\hi} \\mid s_\\hi = s] && \\text{rewrite expectation} \\\\\n &= V_{\\hi}^{\\hat \\pi}(s) && \\text{definition}\n\\end{aligned}Vh(s)[Jπ^(Vh+1)](s)=Eaπ^(s)[r(s,a)+EsP(s,a)[Vh+1(s)]]Eaπ^(s)[r(s,a)+EsP(s,a)[[Jπ^(Vh+2)](s)]]=Eaπ^(s)[r(s,a)+EsP(s,a)[Eaπ^r(s,a)+Es′′Vh+2(s′′)]]=Eτρπ^[Ghsh=s]=Vhπ^(s)definition of Jπ^above lemmadefinition of Jπ^apply at all timestepsrewrite expectationdefinition","enumerator":"1.24","key":"ODRTSv7QEH"},{"type":"paragraph","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"children":[{"type":"text","value":"And so we have ","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"dfXgcuFO4q"},{"type":"inlineMath","value":"V^{\\star} = V^{\\hat \\pi}","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"html":"V=Vπ^V^{\\star} = V^{\\hat \\pi}V=Vπ^","key":"zl7PKRa0Vw"},{"type":"text","value":", making ","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"POihVnLTR4"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"html":"π^\\hat \\piπ^","key":"DIQRsN3LFL"},{"type":"text","value":" optimal.","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"fQQ5eyErat"}],"key":"HwQWWQx8xU"}],"enumerator":"1.1","key":"R1z5Hll5uK"},{"type":"paragraph","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Note that this also gives simplified forms of the ","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"gbwxQsUMjH"},{"type":"crossReference","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Bellman consistency","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"knaIp4YF1l"}],"identifier":"bellman_consistency","label":"bellman_consistency","kind":"proof:theorem","template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"jKTSOMKTrN"},{"type":"text","value":" equations for the optimal policy:","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"VkPWYFDL08"}],"key":"fSfGvKhk8w"},{"type":"proof","kind":"corollary","label":"bellman_consistency_optimal","identifier":"bellman_consistency_optimal","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equations for the optimal policy","position":{"start":{"line":809,"column":1},"end":{"line":809,"column":1}},"key":"Xfak7PeWsK"}],"key":"RlAXu3eY64"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^\\star(s) &= \\max_a Q_\\hi^\\star(s, a) \\\\\n Q_\\hi^\\star(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\star(s')]\n\\end{aligned}","position":{"start":{"line":812,"column":1},"end":{"line":817,"column":1}},"html":"Vh(s)=maxaQh(s,a)Qh(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]\\begin{aligned}\n V_\\hi^\\star(s) &= \\max_a Q_\\hi^\\star(s, a) \\\\\n Q_\\hi^\\star(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\star(s')]\n\\end{aligned}Vh(s)Qh(s,a)=amaxQh(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]","enumerator":"1.25","key":"iyhmCqC5QC"}],"enumerator":"1.1","html_id":"bellman-consistency-optimal","key":"auRlXQgxnq"},{"type":"paragraph","position":{"start":{"line":820,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"Now that we’ve shown this particular greedy policy is optimal, all we\nneed to do is compute the optimal value function and optimal policy. 
We\ncan do this by working backwards in time using ","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"vlothh3wpa"},{"type":"strong","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"UYDIR1kG0v"}],"key":"Hgi72B8dPW"},{"type":"text","value":"\n(DP).","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"zpr6Hz5Msd"}],"key":"ePif5oBnh0"},{"type":"proof","kind":"definition","label":"pi_star_dp","identifier":"pi_star_dp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"DP algorithm to compute an optimal policy in a finite-horizon MDP","position":{"start":{"line":825,"column":1},"end":{"line":825,"column":1}},"key":"Og3h7scRBx"}],"key":"o7Ia8nB4II"},{"type":"paragraph","position":{"start":{"line":828,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"strong","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"Base case.","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"FWr98kBBxW"}],"key":"AUqlDl975f"},{"type":"text","value":" At the end of the episode (time step ","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"n3aMTx1wCA"},{"type":"inlineMath","value":"H-1","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"html":"H1H-1H1","key":"WMi1SGedIa"},{"type":"text","value":"), we can’t\ntake any more actions, so the ","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"s9aFiTstx9"},{"type":"inlineMath","value":"Q","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"html":"QQQ","key":"e6OCGFSKmm"},{"type":"text","value":"-function is simply the reward that\nwe obtain:","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"yBJz9MlXQm"}],"key":"cA0UHpfKOz"},{"type":"math","value":"Q^\\star_{H-1}(s, a) = r(s, a)","position":{"start":{"line":832,"column":1},"end":{"line":832,"column":1}},"html":"QH1(s,a)=r(s,a)Q^\\star_{H-1}(s, a) = r(s, a)QH1(s,a)=r(s,a)","enumerator":"1.26","key":"dFrbQP3WX7"},{"type":"paragraph","position":{"start":{"line":834,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"so the best thing to do\nis just act greedily and get as much reward as we can!","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"G7fcIAdr4w"}],"key":"WoMmfjnGtO"},{"type":"math","value":"\\pi^\\star_{H-1}(s) = \\arg\\max_a Q^\\star_{H-1}(s, a)","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"html":"πH1(s)=argmaxaQH1(s,a)\\pi^\\star_{H-1}(s) = \\arg\\max_a Q^\\star_{H-1}(s, a)πH1(s)=argamaxQH1(s,a)","enumerator":"1.27","key":"TLiuhCxTGA"},{"type":"paragraph","position":{"start":{"line":839,"column":1},"end":{"line":841,"column":1}},"children":[{"type":"text","value":"Then\n","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"wLXCTXqjCl"},{"type":"inlineMath","value":"V^\\star_{H-1}(s)","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"html":"VH1(s)V^\\star_{H-1}(s)VH1(s)","key":"SaonS2baLW"},{"type":"text","value":", the optimal value of state 
","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"nkaheQM0xJ"},{"type":"inlineMath","value":"s","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"html":"sss","key":"Bhf6qyUxAn"},{"type":"text","value":" at the end of the\ntrajectory, is simply whatever action gives the most reward.","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"Td2k9vf2Tk"}],"key":"MvPm5I6tQR"},{"type":"math","value":"V^\\star_{H-1} = \\max_a Q^\\star_{H-1}(s, a)","position":{"start":{"line":843,"column":1},"end":{"line":843,"column":1}},"html":"VH1=maxaQH1(s,a)V^\\star_{H-1} = \\max_a Q^\\star_{H-1}(s, a)VH1=amaxQH1(s,a)","enumerator":"1.28","key":"s3k7uissFI"},{"type":"paragraph","position":{"start":{"line":845,"column":1},"end":{"line":847,"column":1}},"children":[{"type":"strong","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"children":[{"type":"text","value":"Recursion.","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"iqe08nn7rj"}],"key":"zjrbT9xtLa"},{"type":"text","value":" Then, we can work backwards in time, starting from the\nend, using our consistency equations! i.e. for each\n","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"Io0iASFrjB"},{"type":"inlineMath","value":"t = H-2, \\dots, 0","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"html":"t=H2,,0t = H-2, \\dots, 0t=H2,,0","key":"AzGAl63lPR"},{"type":"text","value":", we set","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"UKoQMsYWYu"}],"key":"NsNMhAi39w"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_{t}(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V^\\star_{\\hi+1}(s')] \\\\\n \\pi^\\star_{t}(s) &= \\arg\\max_a Q^\\star_{t}(s, a) \\\\\n V^\\star_{t}(s) &= \\max_a Q^\\star_{t}(s, a)\n\\end{aligned}","position":{"start":{"line":849,"column":1},"end":{"line":855,"column":1}},"html":"Qt(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]πt(s)=argmaxaQt(s,a)Vt(s)=maxaQt(s,a)\\begin{aligned}\n Q^\\star_{t}(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V^\\star_{\\hi+1}(s')] \\\\\n \\pi^\\star_{t}(s) &= \\arg\\max_a Q^\\star_{t}(s, a) \\\\\n V^\\star_{t}(s) &= \\max_a Q^\\star_{t}(s, a)\n\\end{aligned}Qt(s,a)πt(s)Vt(s)=r(s,a)+EsP(s,a)[Vh+1(s)]=argamaxQt(s,a)=amaxQt(s,a)","enumerator":"1.29","key":"yxxqg8sPxd"}],"enumerator":"1.11","html_id":"pi-star-dp","key":"U12HzpgMNL"}],"key":"FxnDawhLFV"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def find_optimal_policy(mdp: MDP):\n Q = [None] * mdp.H\n pi = [None] * mdp.H\n V = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon\n\n for h in range(mdp.H - 1, -1, -1):\n Q[h] = mdp.r + mdp.P @ V[h + 1]\n pi[h] = jnp.eye(mdp.S)[jnp.argmax(Q[h], axis=1)] # one-hot\n V[h] = jnp.max(Q[h], axis=1)\n\n Q = jnp.stack(Q)\n pi = jnp.stack(pi)\n V = jnp.stack(V[:-1])\n\n return pi, V, Q","key":"iXQnFsQA53"},{"type":"output","id":"vmp8o4RL4nfnsW2Y_bCTE","data":[],"key":"XCCQlvSF1r"}],"data":{},"key":"hwBdzUtxtl"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":876,"column":1},"end":{"line":879,"column":1}},"children":[{"type":"text","value":"At each of the 
","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"Dam8t0OEPG"},{"type":"inlineMath","value":"H","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"HHH","key":"xoMqMHW76A"},{"type":"text","value":" timesteps, we must compute ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"fjpTqm9KNp"},{"type":"inlineMath","value":"Q^{\\star}","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"QQ^{\\star}Q","key":"Wf5CKuCN5c"},{"type":"text","value":" for each of\nthe ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"IFhA9DW4ag"},{"type":"inlineMath","value":"|\\mathcal{S}| |\\mathcal{A}|","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"SA|\\mathcal{S}| |\\mathcal{A}|S∣∣A","key":"do3znrEp0L"},{"type":"text","value":" state-action pairs. Each computation takes ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"G8OmzYo5m9"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"S|\\mathcal{S}|S","key":"osSgWSFkwj"},{"type":"text","value":"\noperations to evaluate the average value over ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"F6b2K0JOL6"},{"type":"inlineMath","value":"s'","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"ss's","key":"uwru5u6q24"},{"type":"text","value":". This gives a total\ncomputation time of ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"TiGThIuZmt"},{"type":"inlineMath","value":"O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"O(HS2A)O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)O(HS2A)","key":"N9kGuRsbPP"},{"type":"text","value":".","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"W0d7K5ye4Q"}],"key":"WgiU233ENJ"},{"type":"paragraph","position":{"start":{"line":881,"column":1},"end":{"line":886,"column":1}},"children":[{"type":"text","value":"Note that this algorithm is identical to the policy evaluation algorithm\n","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"xMfThqYfnk"},{"type":"crossReference","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"inlineCode","value":"dp_eval_finite","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"ujJ1aFF2K9"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","key":"F8zGuSFVX3"},{"type":"text","value":", but instead of ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"spF9uadPVE"},{"type":"emphasis","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"averaging","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"dAW1dhTfmP"}],"key":"PWat5K4yH5"},{"type":"text","value":" over the\nactions chosen by a policy, we instead simply take a 
","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"Zi3vc4WO2T"},{"type":"emphasis","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"maximum","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"uvzQ6p9cr0"}],"key":"Hk9zIfzVwm"},{"type":"text","value":" over the\naction-values. We’ll see this relationship between ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"wquvE4NosV"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"CMkdaW5liG"}],"key":"jprnrTAUjR"},{"type":"text","value":"\nand ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"BsVzEON4My"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"optimal policy computation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"Gvr8IWjsxG"}],"key":"U6MZd0zPQy"},{"type":"text","value":" show up again in the infinite-horizon\nsetting.","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"nSfqFTwdMM"}],"key":"SBltnavgUP"}],"key":"enIksWAHkg"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp)\nassert jnp.allclose(π_opt, tidy_policy_messy_only)\nassert jnp.allclose(V_opt, V_messy)\nassert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:])\n\"Assertions passed (the 'tidy when messy' policy is optimal)\"","key":"NbI7OaASOW"},{"type":"output","id":"nzrkWXRLNtsKk4_PtAx3C","data":[{"output_type":"execute_result","execution_count":16,"metadata":{},"data":{"text/plain":{"content":"\"Assertions passed (the 'tidy when messy' policy is optimal)\"","content_type":"text/plain"}}}],"key":"AEi4mHxLon"}],"data":{},"key":"s5xURUn6mL"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"children":[{"type":"text","value":"Infinite-horizon MDPs","position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"key":"J4cAyOJd8J"}],"label":"infinite_horizon_mdps","identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","enumerator":"1.4","key":"dFSAlnE8wQ"},{"type":"paragraph","position":{"start":{"line":899,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"What happens if a trajectory is allowed to continue forever (i.e.\n","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"R5fL3F34lL"},{"type":"inlineMath","value":"H = \\infty","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"html":"H=H = \\inftyH=","key":"KyLthN5sWd"},{"type":"text","value":")? 
This is the setting of ","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"ilUpiD5vjw"},{"type":"strong","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"children":[{"type":"text","value":"infinite horizon","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"obojbzbduy"}],"key":"eSJ2XunK1y"},{"type":"text","value":" MDPs.","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"I4HedX0Y9P"}],"key":"y78YIkJX5b"},{"type":"paragraph","position":{"start":{"line":902,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"In this chapter, we’ll describe the necessary adjustments from the\nfinite-horizon case to make the problem tractable. We’ll show that the\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"DD9FTOcF7J"},{"type":"crossReference","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"AbdMDQXLOD"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"DrI0EpI8RE"},{"type":"text","value":" in the discounted reward setting is a\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"x8uUMXLdzh"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"TyfSZzVmMj"}],"key":"O25zaY5iYA"},{"type":"text","value":" for any policy.\nWe’ll discuss how to evaluate\npolicies (i.e. compute their corresponding value functions). 
Finally,\nwe’ll present and analyze two iterative algorithms, based on the Bellman\noperator, for computing the optimal policy: ","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"V628JM7jQX"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"UiJt2eMYc9"}],"key":"bpdQXlWX2j"},{"type":"text","value":" and\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"oc5x5kVRoI"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"v2QyXvWag4"}],"key":"ze4ipDeDPt"},{"type":"text","value":".","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"GEjuwelenI"}],"key":"iRIghoWC2A"},{"type":"heading","depth":3,"position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"children":[{"type":"text","value":"Discounted rewards","position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"key":"b4yhnHlQN2"}],"identifier":"discounted-rewards","label":"Discounted rewards","html_id":"discounted-rewards","implicit":true,"enumerator":"1.4.1","key":"jfSPIgaZC3"},{"type":"paragraph","position":{"start":{"line":914,"column":1},"end":{"line":918,"column":1}},"children":[{"type":"text","value":"First of all, note that maximizing the cumulative reward\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"Xk0oL2tyPg"},{"type":"inlineMath","value":"r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdots","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"rh+rh+1+rh+2+r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdotsrh+rh+1+rh+2+","key":"uuG6bQNxOm"},{"type":"text","value":" is no longer a good idea since it\nmight blow up to infinity. 
Instead of a time horizon ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"uDAMHSklbv"},{"type":"inlineMath","value":"H","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"HHH","key":"RooSo740IV"},{"type":"text","value":", we now need a\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"Igtmoo82p3"},{"type":"strong","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"children":[{"type":"text","value":"discount factor","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"rCM50o1eu0"}],"key":"qTHyjDKI8C"},{"type":"text","value":" ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"SC3QABXA6j"},{"type":"inlineMath","value":"\\gamma \\in [0, 1)","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"γ[0,1)\\gamma \\in [0, 1)γ[0,1)","key":"mWi16wCbj2"},{"type":"text","value":" such that rewards become less\nvaluable the further into the future they are:","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"XpmrVludlC"}],"key":"i0vqZpCPNn"},{"type":"math","value":"r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.","position":{"start":{"line":920,"column":1},"end":{"line":920,"column":1}},"html":"rh+γrh+1+γ2rh+2+=k=0γkrh+k.r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.rh+γrh+1+γ2rh+2+=k=0γkrh+k.","enumerator":"1.30","key":"kDHyINLQ43"},{"type":"paragraph","position":{"start":{"line":922,"column":1},"end":{"line":924,"column":1}},"children":[{"type":"text","value":"We can think of ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"HmuvoOF6y8"},{"type":"text","value":"γ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"dUNQQQImst"},{"type":"text","value":" as measuring how much we care about the future:\nif it’s close to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"mJcxAzV1xW"},{"type":"text","value":"0","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"IgB4avPgjs"},{"type":"text","value":", we only care about the near-term rewards; it’s\nclose to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"QNCRPtBjNP"},{"type":"text","value":"1","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"xYWqVIF3JB"},{"type":"text","value":", we put more weight into future rewards.","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"i3255wpPMJ"}],"key":"WEplZe2gBw"},{"type":"paragraph","position":{"start":{"line":926,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"You can also analyze ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"TOkgUTL4Vd"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"YWDAdA6hsk"},{"type":"text","value":" as the probability of 
","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"nmrEYmRBQ7"},{"type":"emphasis","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"children":[{"type":"text","value":"continuing","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"epfClVp70Q"}],"key":"PXreroRwFw"},{"type":"text","value":" the\ntrajectory at each time step. (This is equivalent to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"CTZuGa4ADd"},{"type":"inlineMath","value":"H","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"html":"HHH","key":"C2il03DalA"},{"type":"text","value":" being\ndistributed by a First Success distribution with success probability\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"FkLqBGmMVu"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"O3tCFGdW1X"},{"type":"text","value":".) This accords with the above interpretation: if ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"GhZf2EuJx0"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"a1G7e6UIKp"},{"type":"text","value":" is\nclose to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"KdANNZqwMQ"},{"type":"text","value":"0","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"BHWDrOqhUE"},{"type":"text","value":", the trajectory will likely be very short, while if\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"GTQxd8NwbX"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"ktx7SbeR7t"},{"type":"text","value":" is close to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"n61SZUdioH"},{"type":"text","value":"1","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"dk62JIBLB9"},{"type":"text","value":", the trajectory will likely continue for a long\ntime.","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"WrCClbMVgG"}],"key":"UaBtJE0laF"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"BKnyjBDrRY"}],"key":"cWLd1k4jlz"},{"type":"paragraph","position":{"start":{"line":935,"column":1},"end":{"line":937,"column":1}},"children":[{"type":"text","value":"Assuming that ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"Bci362g3TY"},{"type":"inlineMath","value":"r_\\hi \\in [0, 1]","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"rh[0,1]r_\\hi \\in [0, 1]rh[0,1]","key":"t2vreaDfcp"},{"type":"text","value":" for all ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"uAlfGlYJki"},{"type":"inlineMath","value":"\\hi \\in \\mathbb{N}","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"hN\\hi \\in \\mathbb{N}hN","key":"rpUIRwPreS"},{"type":"text","value":",\nwhat is the maximum 
","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"gNiMbP0BUw"},{"type":"strong","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"children":[{"type":"text","value":"discounted","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"lVdHu66VcF"}],"key":"yu6d0Wm2T3"},{"type":"text","value":" cumulative reward? You may find it\nuseful to review geometric series.","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"RswTdyNlKJ"}],"key":"K8YFqRcXch"}],"key":"QjksTZgjFD"},{"type":"paragraph","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"The other components of the MDP remain the same:","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"MFY1qXvtq0"}],"key":"C0nEDqLUXn"},{"type":"math","value":"M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"html":"M=(S,A,μ,P,r,γ).M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).M=(S,A,μ,P,r,γ).","enumerator":"1.31","key":"QoPJafYDxv"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"Code-wise, we can reuse the ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"C1lDrhYB1R"},{"type":"inlineCode","value":"MDP","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"Jw3hzl7l0t"},{"type":"text","value":" class from before ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"pFebwxtns1"},{"type":"crossReference","kind":"proof:definition","identifier":"finite_horizon_mdp","label":"finite_horizon_mdp","children":[{"type":"text","value":"Definition ","key":"MdPaLWyFGM"},{"type":"text","value":"1.2","key":"KabZ3y2o0E"}],"template":"Definition %s","enumerator":"1.2","resolved":true,"html_id":"finite-horizon-mdp","key":"PRoq6gWIDR"},{"type":"text","value":" and set ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"Eh20YV9DaN"},{"type":"inlineCode","value":"mdp.H = float('inf')","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"Vn4SKCAz4J"},{"type":"text","value":".","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"NxVvQ8tGmR"}],"key":"K9xc0KgbQc"}],"key":"cHXYoFyU3G"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"tidy_mdp_inf = tidy_mdp._replace(H=float(\"inf\"), γ=0.95)","key":"XhM7ti8tSI"},{"type":"output","id":"ABnR7JTVFcKn94be1I1ZQ","data":[],"key":"xUOd9Th3JD"}],"data":{},"key":"jPeZ5qM2BO"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"children":[{"type":"text","value":"Stationary policies","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"PyfVZQB1kh"}],"identifier":"stationary-policies","label":"Stationary policies","html_id":"stationary-policies","implicit":true,"enumerator":"1.4.2","key":"wAIwfdlzMI"},{"type":"paragraph","position":{"start":{"line":952,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"text","value":"The time-dependent policies from the finite-horizon case become\ndifficult to handle in the infinite-horizon case. 
In particular, many of\nthe DP approaches we saw required us to start at the end of the\ntrajectory, which is no longer possible. We’ll shift to ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"qRp30jRviM"},{"type":"strong","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"children":[{"type":"text","value":"stationary","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"fmvaiW27ep"}],"key":"Hmcli82fXy"},{"type":"text","value":"\npolicies ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"ggThSNHFzg"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\to \\mathcal{A}","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"π:SA\\pi : \\mathcal{S} \\to \\mathcal{A}π:SA","key":"QXwHVC6vdh"},{"type":"text","value":" (deterministic) or ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"qZQfEX0j3J"},{"type":"inlineMath","value":"\\Delta(\\mathcal{A})","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"Δ(A)\\Delta(\\mathcal{A})Δ(A)","key":"onh61uiHNZ"},{"type":"text","value":" (stochastic).","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"nrzi0Ul0PW"}],"key":"ZMuL89ucO6"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"WAsrkuGU5e"}],"key":"zL5LG5VPmX"},{"type":"paragraph","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Which of the policies in ","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"irgGyXvid8"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"tJJ5eCCQfc"},{"type":"text","value":"1.2","key":"Ec3xAtPFwG"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"DqZ0u006O9"},{"type":"text","value":" are stationary?","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"ZAKVl1yeFK"}],"key":"txWJpo0joa"}],"key":"YkutFVW6LG"},{"type":"heading","depth":3,"position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"children":[{"type":"text","value":"Value functions and Bellman consistency","position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"key":"AjmI1svLIO"}],"identifier":"value-functions-and-bellman-consistency","label":"Value functions and Bellman consistency","html_id":"value-functions-and-bellman-consistency","implicit":true,"enumerator":"1.4.3","key":"V5TgumyCwc"},{"type":"paragraph","position":{"start":{"line":964,"column":1},"end":{"line":966,"column":1}},"children":[{"type":"text","value":"We also consider stationary value functions ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"bjS3m4pUD1"},{"type":"inlineMath","value":"V^\\pi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Vπ:SRV^\\pi : \\mathcal{S} \\to \\mathbb{R}Vπ:SR","key":"LEwXh5D6H0"},{"type":"text","value":" and\n","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"CRwEjH5Xop"},{"type":"inlineMath","value":"Q^\\pi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Qπ:S×ARQ^\\pi : 
\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}Qπ:S×AR","key":"qYqS1MVrGY"},{"type":"text","value":". We need to insert a factor of ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"Hh1qP7nvRw"},{"type":"text","value":"γ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"CsoacqE8eR"},{"type":"text","value":"\ninto the Bellman consistency equation ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"SzhQNeVpXl"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"K2AMqaDEW1"},{"type":"text","value":"1.1","key":"ackUDW9EgT"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"cA8zjzcLrg"},{"type":"text","value":" to account for the discounting:","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"kQoOzfVH5Z"}],"key":"e8MNSkiuqH"},{"type":"math","value":"\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}","label":"bellman_consistency_infinite","identifier":"bellman_consistency_infinite","html":"Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]for any hN=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]for any hN=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}Vπ(s)Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]for any hNfor any hN","enumerator":"1.32","html_id":"bellman-consistency-infinite","key":"jjVsBJc6XA"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"yfnSw18wbK"}],"key":"eASgz07CyC"},{"type":"paragraph","position":{"start":{"line":980,"column":1},"end":{"line":981,"column":1}},"children":[{"type":"text","value":"Heuristically speaking, why does it no longer matter which\ntime step we condition on when defining the value function?","position":{"start":{"line":980,"column":1},"end":{"line":980,"column":1}},"key":"MuwQ3yXDet"}],"key":"vZxeOU8fVz"}],"key":"Ohc5Sl2G8X"},{"type":"heading","depth":2,"position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"children":[{"type":"text","value":"Solving infinite-horizon 
MDPs","position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"key":"SPUKJPBHux"}],"identifier":"solving-infinite-horizon-mdps","label":"Solving infinite-horizon MDPs","html_id":"solving-infinite-horizon-mdps","implicit":true,"enumerator":"1.5","key":"TxehpTlhc2"},{"type":"heading","depth":3,"position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"key":"qYdXSS6Quo"}],"identifier":"the-bellman-operator-is-a-contraction-mapping","label":"The Bellman operator is a contraction mapping","html_id":"the-bellman-operator-is-a-contraction-mapping","implicit":true,"enumerator":"1.5.1","key":"Z4xpe3Zvgf"},{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":991,"column":1}},"children":[{"type":"text","value":"Recall from ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"h9oU2tzTbu"},{"type":"crossReference","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"children":[{"type":"text","value":"Definition ","key":"EPzrjksnpa"},{"type":"text","value":"1.8","key":"uPVDYaHQaG"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"iZbA4kM9wj"},{"type":"text","value":" that the Bellman operator ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"BpQKhmXVIz"},{"type":"inlineMath","value":"\\mathcal{J}^{\\pi}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"Jπ\\mathcal{J}^{\\pi}Jπ","key":"IBJ7mdP1YT"},{"type":"text","value":"\nfor a policy ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"ijjvaZBf8k"},{"type":"text","value":"π","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"nUzL49lCTF"},{"type":"text","value":" takes in a “value function” ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"nkHqiZstoj"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"lz4xF8DDg0"},{"type":"text","value":" and\nreturns the r.h.s. of the Bellman equation for that “value function”. 
In\nthe infinite-horizon setting, this is","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"ngar6K04PS"}],"key":"xC79U5XVYC"},{"type":"math","value":"[\mathcal{J}^{\pi}(v)](s) := \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} [r(s, a) + \gamma v(s')].","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].[\mathcal{J}^{\pi}(v)](s) := \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} [r(s, a) + \gamma v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].","enumerator":"1.33","key":"jxRCKIJFiP"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"The crucial property of the Bellman operator is that it is a\n","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"kH1WCVhF0g"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"PMXP5gDJ9q"}],"key":"lEjnioveHt"},{"type":"text","value":" for any policy. Intuitively, if we start with\ntwo “value functions” ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"melyMrydtT"},{"type":"inlineMath","value":"v, u : \mathcal{S} \to \mathbb{R}","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"html":"v,u:SRv, u : \mathcal{S} \to \mathbb{R}v,u:SR","key":"EbqqUXNo2b"},{"type":"text","value":" and repeatedly apply the\nBellman operator to each of them, they will get closer and closer\ntogether at an exponential rate.","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"yUmwhnJU1C"}],"key":"Gd9eHD6xPG"},{"type":"proof","kind":"definition","label":"contraction","identifier":"contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Contraction mapping","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"RjB15VQtEI"}],"key":"gnk0kpbYyn"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1005,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"ttPP3mleQl"},{"type":"inlineMath","value":"X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"XXX","key":"wIfPSEbQwY"},{"type":"text","value":" be some space with a norm ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"bh2eitkcby"},{"type":"inlineMath","value":"\|\cdot\|","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"\|\cdot\|","key":"WxFFA90uVX"},{"type":"text","value":". 
We call an operator\n","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"OQQDnPrBso"},{"type":"inlineMath","value":"f: X \\to X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"f:XXf: X \\to Xf:XX","key":"y9Ht8IQVsK"},{"type":"text","value":" a ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"KQ13FXgIyI"},{"type":"strong","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"QluR90p8Qe"}],"key":"VwRuO3v61d"},{"type":"text","value":" if for any ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"FWYpgCJ5iJ"},{"type":"inlineMath","value":"x, y \\in X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"x,yXx, y \\in Xx,yX","key":"FV3C0LAL59"},{"type":"text","value":",","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"mNH3hFMqjR"}],"key":"dDEcVVAfW4"},{"type":"math","value":"\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|","position":{"start":{"line":1007,"column":1},"end":{"line":1007,"column":1}},"html":"f(x)f(y)γxy\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|f(x)f(y)γxy","enumerator":"1.34","key":"ULst5qI1Ez"},{"type":"paragraph","position":{"start":{"line":1009,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"for some fixed ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"EzDn5cbwDz"},{"type":"inlineMath","value":"\\gamma \\in (0, 1)","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"html":"γ(0,1)\\gamma \\in (0, 1)γ(0,1)","key":"zeguMbDbu5"},{"type":"text","value":".\nIntuitively, this means that if two points are ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"NCjPhkl09s"},{"type":"text","value":"δ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"dFeJ3tRI4c"},{"type":"text","value":" far apart,\nafter applying the mapping,","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"poR2aX2L5Q"}],"key":"sUAcCdKVF9"}],"enumerator":"1.12","html_id":"contraction","key":"jVoD5ZI5g7"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"eBVq6jPkUY"}],"key":"IaPdFoUaPM"},{"type":"paragraph","position":{"start":{"line":1016,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Show that for a contraction mapping ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"W3hBJion86"},{"type":"inlineMath","value":"f","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"fff","key":"WPvRivE0QA"},{"type":"text","value":" with coefficient\n","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"FHAnBfN88W"},{"type":"text","value":"γ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"LKmyvQk5gM"},{"type":"text","value":", for all ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"Qo7NINVxOZ"},{"type":"inlineMath","value":"t \\in \\mathbb{N}","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"tNt \\in 
\\mathbb{N}tN","key":"uT4a0gH0Pq"},{"type":"text","value":",","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"yrSwSi0bij"}],"key":"qgTFxA6DD7"},{"type":"math","value":"\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"f(t)(x)f(t)(y)γtxy,\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,f(t)(x)f(t)(y)γtxy,","enumerator":"1.35","key":"NDTXU1qyHI"},{"type":"paragraph","position":{"start":{"line":1021,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"i.e. that any\ntwo points will be pushed closer by at least a factor of ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"HtwnoiRclJ"},{"type":"text","value":"γ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"LQbNG0vLL6"},{"type":"text","value":" at\neach iteration.","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"a2udIRjPiS"}],"key":"LWORTpMUXc"}],"key":"rrSCUcBQ82"},{"type":"paragraph","position":{"start":{"line":1026,"column":1},"end":{"line":1029,"column":1}},"children":[{"type":"text","value":"It is a powerful fact (known as the ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"hflcqzebyj"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"Banach fixed-point theorem","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"IU0t9CQVtY"}],"key":"yItYbfd2Tf"},{"type":"text","value":") that\nevery contraction mapping has a unique ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"JXfMmcV94o"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"fixed point","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"CZjs8AfRi9"}],"key":"yrXMf0JXmX"},{"type":"text","value":" ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"rv0TLBWWZq"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"KXKyCRZZLx"},{"type":"text","value":" such\nthat ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"Wd9bqQ5zlh"},{"type":"inlineMath","value":"f(x^\\star) = x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"f(x)=xf(x^\\star) = x^\\starf(x)=x","key":"JVnV8l8mqP"},{"type":"text","value":". 
This means that if we repeatedly apply ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"Vupil5muKt"},{"type":"inlineMath","value":"f","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"fff","key":"qugNXQ5CRl"},{"type":"text","value":"\nto any starting point, we will eventually converge to ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"LaGNjfNxGG"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"YcGV3eNmCt"},{"type":"text","value":":","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"rga4w4ygCL"}],"key":"pp2m4ts74O"},{"type":"math","value":"\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.","label":"contraction_convergence","identifier":"contraction_convergence","html":"f(t)(x)xγtxx.\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.f(t)(x)xγtxx∥.","enumerator":"1.36","html_id":"contraction-convergence","key":"rLNBk8G1wR"},{"type":"paragraph","position":{"start":{"line":1037,"column":1},"end":{"line":1040,"column":1}},"children":[{"type":"text","value":"Let’s return to the RL setting and apply this result to the Bellman\noperator. How can we measure the distance between two “value functions”\n","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"It99WmTNMn"},{"type":"inlineMath","value":"v, u : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"html":"v,u:SRv, u : \\mathcal{S} \\to \\mathbb{R}v,u:SR","key":"TkxlmKhCrm"},{"type":"text","value":"? We’ll take the ","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"hZ1EIJaWxq"},{"type":"strong","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"children":[{"type":"text","value":"supremum norm","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"a4lJvF3A4p"}],"key":"pYOZo5SZFn"},{"type":"text","value":" as our distance\nmetric:","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"X0LJuHBEWf"}],"key":"VBCvg1ubmT"},{"type":"math","value":"\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,","position":{"start":{"line":1042,"column":1},"end":{"line":1042,"column":1}},"html":"vu:=supsSv(s)u(s),\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,vu:=sSsupv(s)u(s),","enumerator":"1.37","key":"l7sZDB5XjQ"},{"type":"paragraph","position":{"start":{"line":1044,"column":1},"end":{"line":1048,"column":1}},"children":[{"type":"text","value":"i.e.\nwe compare the “value functions” on the state that causes the biggest\ngap between them. 
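","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"addContr0"}],"key":"addContr1"},{"type":"paragraph","children":[{"type":"text","value":"To make this concrete, here is a hypothetical sketch (it assumes the bellman_operator function and the tidy_mdp_inf and tidy_policy_messy_only objects defined elsewhere in this notebook) that tracks the supremum-norm gap between two arbitrary starting points under repeated application of the Bellman operator:","key":"addContr2"}],"key":"addContr3"},{"type":"code","lang":"python","value":"from functools import partial\n\nimport jax.numpy as jnp\n\n# Hypothetical sketch: bellman_operator, tidy_mdp_inf, and\n# tidy_policy_messy_only are assumed from elsewhere in this notebook.\nop = partial(bellman_operator, tidy_mdp_inf, tidy_policy_messy_only[0])\nv = jnp.zeros(2)\nu = jnp.array([10.0, -5.0])\nfor _ in range(3):\n    print(jnp.max(jnp.abs(v - u)))  # gap shrinks by a factor ≤ γ = 0.95 per step\n    v, u = op(v), op(u)","key":"addContr4"},{"type":"paragraph","children":[{"type":"text","value":"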
Then ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"WfE9QIFcGB"},{"type":"crossReference","kind":"equation","identifier":"contraction_convergence","label":"contraction_convergence","children":[{"type":"text","value":"(","key":"QMcouVHzff"},{"type":"text","value":"1.36","key":"Ioku6XGWUd"},{"type":"text","value":")","key":"Qqgo57JeC1"}],"template":"(%s)","enumerator":"1.36","resolved":true,"html_id":"contraction-convergence","key":"NsW9kchciN"},{"type":"text","value":" implies that if we repeatedly\napply ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"QYVK40aiz8"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"Jπ\\mathcal{J}^\\piJπ","key":"MwAdMraLxb"},{"type":"text","value":" to any starting “value function”, we will eventually\nconverge to ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"P7IONzvFHe"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"VπV^\\piVπ","key":"OLffK7gZRw"},{"type":"text","value":":","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"xu1jzhei4q"}],"key":"Hug6mCmQW5"},{"type":"math","value":"\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.","label":"bellman_convergence","identifier":"bellman_convergence","html":"(Jπ)(t)(v)VπγtvVπ.\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.(Jπ)(t)(v)VπγtvVπ.","enumerator":"1.38","html_id":"bellman-convergence","key":"NznNkowYdv"},{"type":"paragraph","position":{"start":{"line":1056,"column":1},"end":{"line":1057,"column":1}},"children":[{"type":"text","value":"We’ll use this useful fact to prove the convergence of several\nalgorithms later on.","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"key":"e52pxsQIsM"}],"key":"dGd9rUqdJW"},{"type":"proof","kind":"theorem","label":"bellman_contraction","identifier":"bellman_contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":1059,"column":1},"end":{"line":1059,"column":1}},"key":"NSv7mr7XrP"}],"key":"nIJfBH8j7I"},{"type":"math","value":"\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.","position":{"start":{"line":1062,"column":1},"end":{"line":1064,"column":1}},"html":"Jπ(v)Jπ(u)γvu.\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.Jπ(v)Jπ(u)γvu.","enumerator":"1.39","key":"qcBWvTF0Kk"}],"enumerator":"1.4","html_id":"bellman-contraction","key":"BeeJ19liBZ"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof of ","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"JvtWHF5uG9"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_contraction","label":"bellman_contraction","children":[{"type":"text","value":"Theorem ","key":"EBjMpcD1lS"},{"type":"text","value":"1.4","key":"NWpg0I4t5p"}],"template":"Theorem 
%s","enumerator":"1.4","resolved":true,"html_id":"bellman-contraction","key":"PWlWfXOfmM"}],"key":"XsCMPeDsqY"},{"type":"paragraph","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"children":[{"type":"text","value":"For all states ","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"JKGWlWhJdY"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"lkjLWe1qNr"},{"type":"text","value":",","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"VTqOoGzOoX"}],"key":"GLu0992V7d"},{"type":"math","value":"\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}","position":{"start":{"line":1071,"column":1},"end":{"line":1080,"column":1}},"html":"[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γmaxsv(s)u(s)=γvu.\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γsmaxv(s)u(s)=γvu.","enumerator":"1.40","key":"xYTR9nBoDw"}],"enumerator":"1.2","key":"MU3fvXlD2d"},{"type":"heading","depth":3,"position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"children":[{"type":"text","value":"Policy evaluation in infinite-horizon MDPs","position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"key":"dnxbtA81t8"}],"identifier":"policy-evaluation-in-infinite-horizon-mdps","label":"Policy evaluation in infinite-horizon MDPs","html_id":"policy-evaluation-in-infinite-horizon-mdps","implicit":true,"enumerator":"1.5.2","key":"KFwo3wZcmW"},{"type":"paragraph","position":{"start":{"line":1085,"column":1},"end":{"line":1087,"column":1}},"children":[{"type":"text","value":"The backwards DP technique we used in ","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"PQfWyASUUT"},{"type":"crossReference","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"children":[{"type":"text","value":"the finite-horizon 
case","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"d4rzQstGP7"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","key":"Op5Lkns3y3"},{"type":"text","value":" no\nlonger works since there is no “final timestep” to start from. We’ll\nneed another approach to policy evaluation.","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"nc78TLcJX7"}],"key":"gVhY5HHHSG"},{"type":"paragraph","position":{"start":{"line":1089,"column":1},"end":{"line":1092,"column":1}},"children":[{"type":"text","value":"The Bellman consistency conditions yield a system of equations we can\nsolve to evaluate a deterministic policy ","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"BF1XNSq8jv"},{"type":"emphasis","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"Z2mzChf3F9"}],"key":"STZdSd05M4"},{"type":"text","value":". For a faster approximate solution,\nwe can iterate the policy’s Bellman operator, since we know that it has\na unique fixed point at the true value function.","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"H5sWnmMOuT"}],"key":"q5KOlKbbgl"},{"type":"heading","depth":4,"position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"children":[{"type":"text","value":"Matrix inversion for deterministic policies","position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"key":"Php3GK8OtZ"}],"identifier":"matrix-inversion-for-deterministic-policies","label":"Matrix inversion for deterministic policies","html_id":"matrix-inversion-for-deterministic-policies","implicit":true,"enumerator":"1.5.2.1","key":"ijCAG1kJZ4"},{"type":"paragraph","position":{"start":{"line":1096,"column":1},"end":{"line":1098,"column":1}},"children":[{"type":"text","value":"Note that when the policy ","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"xfkskZv74Q"},{"type":"text","value":"π","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"Bny5RGP1sN"},{"type":"text","value":" is deterministic, the actions can be\ndetermined from the states, and so we can chop off the action dimension\nfor the rewards and state transitions:","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"k9myQvKI9k"}],"key":"rS7j3XJeBk"},{"type":"math","value":"\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}.\n\\end{aligned}","position":{"start":{"line":1100,"column":1},"end":{"line":1105,"column":1}},"html":"rπRSPπ[0,1]S×Sμ[0,1]SπASVπRSQπRS×A.\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times 
|\\mathcal{A}|}.\n\\end{aligned}rππRSASPπVπ[0,1]S×SRSμQπ[0,1]SRS×A.","enumerator":"1.41","key":"YWCGauZK3g"},{"type":"paragraph","position":{"start":{"line":1107,"column":1},"end":{"line":1109,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"h0maZx2gMA"},{"type":"inlineMath","value":"P^\\pi","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"PπP^\\piPπ","key":"C0VWIRputH"},{"type":"text","value":", we’ll treat the rows as the states and the\ncolumns as the next states. Then ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"N8ujuRZ8Tl"},{"type":"inlineMath","value":"P^\\pi_{s, s'}","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"Ps,sπP^\\pi_{s, s'}Ps,sπ","key":"sZi8KRS5o2"},{"type":"text","value":" is the probability of\ntransitioning from state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"D9U9FBHkep"},{"type":"inlineMath","value":"s","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"sss","key":"pCTgKgP4gv"},{"type":"text","value":" to state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"fDZr9DlsWU"},{"type":"inlineMath","value":"s'","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"ss's","key":"VZ97sQLZZb"},{"type":"text","value":" under policy ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"nZXVFhHkV0"},{"type":"text","value":"π","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"yIG4LcaHP3"},{"type":"text","value":".","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"Foj2rjILaM"}],"key":"n1OIvKpZzH"},{"type":"proof","kind":"example","label":"tidy_tabular","identifier":"tidy_tabular","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying MDP","position":{"start":{"line":1111,"column":1},"end":{"line":1111,"column":1}},"key":"u4HxxsLUSL"}],"key":"y6B0x1lP8l"},{"type":"paragraph","position":{"start":{"line":1114,"column":1},"end":{"line":1116,"column":1}},"children":[{"type":"text","value":"The tabular MDP from before has ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"dW67RGD5XO"},{"type":"inlineMath","value":"|\\mathcal{S}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"S=2|\\mathcal{S}| = 2S=2","key":"Qwp0geDiBn"},{"type":"text","value":" and ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"NHVr3UnnR5"},{"type":"inlineMath","value":"|\\mathcal{A}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"A=2|\\mathcal{A}| = 2A=2","key":"p3txODnaDI"},{"type":"text","value":". 
Let’s write\ndown the quantities for the policy ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"rPrCikQIUb"},{"type":"text","value":"π","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"UlBQUF5ypU"},{"type":"text","value":" that tidies if and only if the\nroom is messy:","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"fOk3UkzW0c"}],"key":"UhIbJKXsMM"},{"type":"math","value":"r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}","position":{"start":{"line":1118,"column":1},"end":{"line":1120,"column":1}},"html":"rπ=[10],Pπ=[0.70.310],μ=[10]r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}rπ=[10],Pπ=[0.710.30],μ=[10]","enumerator":"1.42","key":"bPpW3WUEF4"},{"type":"paragraph","position":{"start":{"line":1122,"column":1},"end":{"line":1123,"column":1}},"children":[{"type":"text","value":"We’ll see how to\nevaluate this policy in the next section.","position":{"start":{"line":1122,"column":1},"end":{"line":1122,"column":1}},"key":"EvXHesCkl3"}],"key":"kY9dfjH0TN"}],"enumerator":"1.5","html_id":"tidy-tabular","key":"HTvTzOUmtk"},{"type":"paragraph","position":{"start":{"line":1126,"column":1},"end":{"line":1127,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation for a deterministic policy can be\nwritten in tabular notation as","position":{"start":{"line":1126,"column":1},"end":{"line":1126,"column":1}},"key":"cRYLLCha2N"}],"key":"m3yUINLE2t"},{"type":"math","value":"V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.","position":{"start":{"line":1129,"column":1},"end":{"line":1129,"column":1}},"html":"Vπ=rπ+γPπVπ.V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.Vπ=rπ+γPπVπ.","enumerator":"1.43","key":"wNBV52WmbL"},{"type":"paragraph","position":{"start":{"line":1131,"column":1},"end":{"line":1133,"column":1}},"children":[{"type":"text","value":"(Unfortunately, this notation doesn’t simplify the expression for\n","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"VEoBY1zW7H"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"html":"QπQ^\\piQπ","key":"rDHz5TyKCK"},{"type":"text","value":".) 
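","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"addTab0"}],"key":"addTab1"},{"type":"paragraph","children":[{"type":"text","value":"As a sanity check, this consistency condition can be verified numerically; the hypothetical snippet below plugs in the matrices from Example 1.5 together with the value function derived in Example 1.6 below.","key":"addTab2"}],"key":"addTab3"},{"type":"code","lang":"python","value":"import jax.numpy as jnp\n\n# Hypothetical check of V = r + γ P V with the matrices from Example 1.5.\nγ = 0.95\nr_π = jnp.array([1.0, 0.0])\nP_π = jnp.array([[0.7, 0.3], [1.0, 0.0]])\nV_π = jnp.array([15.56, 14.79])  # derived in Example 1.6 below\nprint(r_π + γ * P_π @ V_π)  # ≈ [15.56, 14.78]: V_π is a fixed point, up to rounding","key":"addTab4"},{"type":"paragraph","children":[{"type":"text","value":"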
This system of equations can be solved with a matrix\ninversion:","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"snQceKLSIn"}],"key":"JRniQJUq3H"},{"type":"math","value":"V^\pi = (I - \gamma P^\pi)^{-1} r^\pi.","label":"matrix_inversion_pe","identifier":"matrix_inversion_pe","html":"Vπ=(IγPπ)1rπ.V^\pi = (I - \gamma P^\pi)^{-1} r^\pi.Vπ=(IγPπ)1rπ.","enumerator":"1.44","html_id":"matrix-inversion-pe","key":"jH3hwVCLic"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"Uapj11uUe1"}],"key":"Rw4z9IX9qs"},{"type":"paragraph","position":{"start":{"line":1142,"column":1},"end":{"line":1143,"column":1}},"children":[{"type":"text","value":"Note we’ve assumed that ","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"U3jWDhS1xp"},{"type":"inlineMath","value":"I - \gamma P^\pi","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"html":"IγPπI - \gamma P^\piIγPπ","key":"CvgqXszOmz"},{"type":"text","value":" is invertible. Can you see\nwhy this is the case?","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"ZWSOE6YJj3"}],"key":"IepIzEZvyJ"},{"type":"paragraph","position":{"start":{"line":1145,"column":1},"end":{"line":1149,"column":1}},"children":[{"type":"text","value":"(Recall that a linear operator, i.e. a square matrix, is invertible if\nand only if its null space is trivial; that is, it doesn’t map any\nnonzero vector to zero. In this case, we can see that ","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"VHVfRd5uZS"},{"type":"inlineMath","value":"I - \gamma P^\pi","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"html":"IγPπI - \gamma P^\piIγPπ","key":"OsgIonknuo"},{"type":"text","value":"\nis invertible because it never maps a nonzero vector to zero: since each row of P^π sums to one,\nwe have ‖γP^π v‖∞ ≤ γ‖v‖∞ < ‖v‖∞ for any nonzero v, so (I − γP^π)v must have at\nleast one nonzero element.)","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"FdzCLC8308"}],"key":"WBc3JLQoB5"}],"key":"ZmatuVHA7O"}],"key":"SG5aD3RA7l"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def eval_deterministic_infinite(\n mdp: MDP, policy: Float[Array, \"S A\"]\n) -> Float[Array, \" S\"]:\n pi = jnp.argmax(policy, axis=1) # un-one-hot\n P_π = mdp.P[jnp.arange(mdp.S), pi]\n r_π = mdp.r[jnp.arange(mdp.S), pi]\n return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π)","key":"RhIJgReZSa"},{"type":"output","id":"aFuPmxEEkxUSTQBm3zJNY","data":[],"key":"MrL1fRjK1M"}],"data":{},"key":"dYvjMrfPks"},{"type":"block","children":[{"type":"proof","kind":"example","label":"tidy_eval_infinite","identifier":"tidy_eval_infinite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":1162,"column":1},"end":{"line":1162,"column":1}},"key":"jkzBXkmhqy"}],"key":"iAlbhoNsVj"},{"type":"paragraph","position":{"start":{"line":1165,"column":1},"end":{"line":1166,"column":1}},"children":[{"type":"text","value":"Let’s use the same policy ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"RYgc7AjWeQ"},{"type":"text","value":"π","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"F2Gbz89dvZ"},{"type":"text","value":" that tidies if and only if the room is\nmessy. 
Setting ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"IiR8KO8ph5"},{"type":"inlineMath","value":"\\gamma = 0.95","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"html":"γ=0.95\\gamma = 0.95γ=0.95","key":"ymflu5bCY3"},{"type":"text","value":", we must invert","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"ZNjCCxtcIP"}],"key":"aSmlUHWY6E"},{"type":"math","value":"I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.","position":{"start":{"line":1168,"column":1},"end":{"line":1168,"column":1}},"html":"IγPπ=[10.95×0.70.95×0.30.95×110.95×0]=[0.3350.2850.951].I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.IγPπ=[10.95×0.70.95×10.95×0.310.95×0]=[0.3350.950.2851].","enumerator":"1.45","key":"Nz581T4yLc"},{"type":"paragraph","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"children":[{"type":"text","value":"The inverse to two decimal points is","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"key":"isGMHcPnaQ"}],"key":"sBxvsim6K5"},{"type":"math","value":"(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.","position":{"start":{"line":1172,"column":1},"end":{"line":1172,"column":1}},"html":"(IγPπ)1=[15.564.4414.795.21].(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.(IγPπ)1=[15.5614.794.445.21].","enumerator":"1.46","key":"eR7vVRJgup"},{"type":"paragraph","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"children":[{"type":"text","value":"Thus the value function is","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"key":"Y22t7xirTp"}],"key":"ZNBgUevBZm"},{"type":"math","value":"V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.","position":{"start":{"line":1176,"column":1},"end":{"line":1176,"column":1}},"html":"Vπ=(IγPπ)1rπ=[15.564.4414.795.21][10]=[15.5614.79].V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.Vπ=(IγPπ)1rπ=[15.5614.794.445.21][10]=[15.5614.79].","enumerator":"1.47","key":"zUfGPdbZk0"},{"type":"paragraph","position":{"start":{"line":1178,"column":1},"end":{"line":1181,"column":1}},"children":[{"type":"text","value":"Let’s sanity-check this result. 
Since rewards are at most ","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"t1SEyKikFE"},{"type":"text","value":"1","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"RcLOJmQ7Zt"},{"type":"text","value":", the\nmaximum cumulative return of a trajectory is at most\n","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"b1IC1KTWkh"},{"type":"inlineMath","value":"1/(1-\\gamma) = 20","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"html":"1/(1γ)=201/(1-\\gamma) = 201/(1γ)=20","key":"PFMdjLX1CT"},{"type":"text","value":". We see that the value function is indeed slightly\nlower than this.","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"lfXfx7IIPx"}],"key":"Xn1MraN6YP"}],"enumerator":"1.6","html_id":"tidy-eval-infinite","key":"a0XY6KiPEi"}],"key":"pDKiumHKoY"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])","key":"FjmQbxfM07"},{"type":"output","id":"Y7BVT5x_v7ec9Erd6Pdrb","data":[{"output_type":"execute_result","execution_count":19,"metadata":{},"data":{"text/plain":{"content":"Array([15.56419, 14.78598], dtype=float32)","content_type":"text/plain"}}}],"key":"ZJRtLnfxAr"}],"data":{},"key":"ItFiOvDePs"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"children":[{"type":"text","value":"Iterative policy evaluation","position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"key":"jD0fPmQydH"}],"label":"iterative_pe","identifier":"iterative_pe","html_id":"iterative-pe","enumerator":"1.5.2.2","key":"mwYqbaHFIP"},{"type":"paragraph","position":{"start":{"line":1191,"column":1},"end":{"line":1194,"column":1}},"children":[{"type":"text","value":"The matrix inversion above takes roughly ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"hrgcZKwP0D"},{"type":"inlineMath","value":"O(|\\mathcal{S}|^3)","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"html":"O(S3)O(|\\mathcal{S}|^3)O(S3)","key":"q8Hj0y7wM9"},{"type":"text","value":" time.\nIt also only works for deterministic policies.\nCan we trade off the requirement of finding the ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"ldVVLNUgzH"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"exact","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"oLYCocBBt2"}],"key":"km7ZUcp2jD"},{"type":"text","value":" value function for a faster\n","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"MNJQDsu3jd"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"WuHBgk3iBZ"}],"key":"aY1SFTkOMr"},{"type":"text","value":" algorithm that will also extend to stochastic policies?","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"rmW56x4vVK"}],"key":"eENjqtVSR1"},{"type":"paragraph","position":{"start":{"line":1196,"column":1},"end":{"line":1199,"column":1}},"children":[{"type":"text","value":"Let’s use the Bellman operator to 
define an iterative algorithm for\ncomputing the value function. We’ll start with an initial guess\n","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"QRLzF6Lmx2"},{"type":"inlineMath","value":"v^{(0)}","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"html":"v(0)v^{(0)}v(0)","key":"yLk4EEmoHb"},{"type":"text","value":" with elements in ","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"fAd38OWrOv"},{"type":"inlineMath","value":"[0, 1/(1-\\gamma)]","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"html":"[0,1/(1γ)][0, 1/(1-\\gamma)][0,1/(1γ)]","key":"GDWDxGqmvQ"},{"type":"text","value":" and then iterate the\nBellman operator:","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"RcP2LbcASP"}],"key":"HXlZ050Bvo"},{"type":"math","value":"v^{(t+1)} = \\mathcal{J}^{\\pi}(v^{(t)}),","position":{"start":{"line":1201,"column":1},"end":{"line":1201,"column":1}},"html":"v(t+1)=Jπ(v(t)),v^{(t+1)} = \\mathcal{J}^{\\pi}(v^{(t)}),v(t+1)=Jπ(v(t)),","enumerator":"1.48","key":"U7hRFW2IRZ"},{"type":"paragraph","position":{"start":{"line":1203,"column":1},"end":{"line":1204,"column":1}},"children":[{"type":"text","value":"i.e. ","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"sohZWbKhLP"},{"type":"inlineMath","value":"v^{(t)} = (\\mathcal{J}^{\\pi})^{(t)} (v^{(0)})","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"html":"v(t)=(Jπ)(t)(v(0))v^{(t)} = (\\mathcal{J}^{\\pi})^{(t)} (v^{(0)})v(t)=(Jπ)(t)(v(0))","key":"twrcJXTQIq"},{"type":"text","value":". Note that each iteration\ntakes ","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"CqCyFmboXT"},{"type":"inlineMath","value":"O(|\\mathcal{S}|^2)","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"html":"O(S2)O(|\\mathcal{S}|^2)O(S2)","key":"vZPpwhdKIM"},{"type":"text","value":" time for the matrix-vector multiplication.","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"lkk6RK2TBa"}],"key":"xV31wZ7j4Y"}],"key":"toJUHHEGHt"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def supremum_norm(v):\n return jnp.max(jnp.abs(v)) # same as jnp.linalg.norm(v, jnp.inf)\n\n\ndef loop_until_convergence(op, v, ε=1e-6):\n \"\"\"Repeatedly apply op to v until convergence (in supremum norm).\"\"\"\n while True:\n v_new = op(v)\n if supremum_norm(v_new - v) < ε:\n return v_new\n v = v_new\n\n\ndef iterative_evaluation(mdp: MDP, pi: Float[Array, \"S A\"], ε=1e-6) -> Float[Array, \" S\"]:\n op = partial(bellman_operator, mdp, pi)\n return loop_until_convergence(op, jnp.zeros(mdp.S), ε)","key":"mORiAlX5Ai"},{"type":"output","id":"xqnpLMAEtohD_5AtPA8HN","data":[],"key":"w7YbR6qB6S"}],"data":{},"key":"rR84mIZVWb"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"children":[{"type":"text","value":"Then, as we showed in 
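{"type":"block","children":[{"type":"paragraph","children":[{"type":"text","value":"As an empirical sketch of this convergence (reusing bellman_operator, supremum_norm, tidy_mdp_inf, and tidy_policy_messy_only from earlier in the chapter), we can watch the supremum-norm error shrink by roughly a factor of γ per iteration:","key":"adEx2Lead"}],"key":"adEx2Par"}],"key":"adEx2Blk"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# Sketch: watch the supremum-norm error shrink by a factor of γ per step.\nv_star = iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])\nv = jnp.zeros(tidy_mdp_inf.S)\nfor t in range(5):\n    print(t, supremum_norm(v - v_star))\n    v = bellman_operator(tidy_mdp_inf, tidy_policy_messy_only[0], v)","key":"adEx2Code"}],"data":{},"key":"adEx2CodeBlk"},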
","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"key":"fj2qGuhwFT"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"Tbf5DqIp3j"},{"type":"text","value":"1.38","key":"c2byBokoAM"},{"type":"text","value":")","key":"JDv5XtTZja"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"bwuVDX3RWT"},{"type":"text","value":", by the Banach fixed-point theorem:","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"key":"v2wqlQzgEM"}],"key":"RmfcISaQFD"},{"type":"math","value":"\\|v^{(t)} - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v^{(0)} - V^\\pi\\|_{\\infty}.","position":{"start":{"line":1227,"column":1},"end":{"line":1227,"column":1}},"html":"v(t)Vπγtv(0)Vπ.\\|v^{(t)} - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v^{(0)} - V^\\pi\\|_{\\infty}.v(t)Vπγtv(0)Vπ.","enumerator":"1.49","key":"FurnmpdV0P"}],"key":"gAeJTfVuxo"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])","key":"yEk1SXAEej"},{"type":"output","id":"xo6NsNJoIkPGKwirqeYWR","data":[{"output_type":"execute_result","execution_count":21,"metadata":{},"data":{"text/plain":{"content":"Array([15.564166, 14.785956], dtype=float32)","content_type":"text/plain"}}}],"key":"n9YuWI91yX"}],"data":{},"key":"zKntdezpGJ"},{"type":"block","children":[{"type":"proof","kind":"remark","label":"iterations_vi","identifier":"iterations_vi","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Convergence of iterative policy evaluation","position":{"start":{"line":1233,"column":1},"end":{"line":1233,"column":1}},"key":"Bn2KzJFZ9f"}],"key":"zARJ8BUMR1"},{"type":"paragraph","position":{"start":{"line":1236,"column":1},"end":{"line":1237,"column":1}},"children":[{"type":"text","value":"How many iterations do we need for an ","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"iq9L0bdivl"},{"type":"text","value":"ε","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"LTUT8w8Pez"},{"type":"text","value":"-accurate estimate? 
We\ncan work backwards to solve for ","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"t7NAujTe3b"},{"type":"inlineMath","value":"t","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"html":"ttt","key":"sQYZYEtaTq"},{"type":"text","value":":","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"yxfYejWAOk"}],"key":"Kqc2fVFp2e"},{"type":"math","value":"\\begin{aligned}\n \\gamma^t \\|v^{(0)} - V^\\pi\\|_{\\infty} &\\le \\epsilon \\\\\n t &\\ge \\frac{\\log (\\epsilon / \\|v^{(0)} - V^\\pi\\|_{\\infty})}{\\log \\gamma} \\\\\n &= \\frac{\\log (\\|v^{(0)} - V^\\pi\\|_{\\infty} / \\epsilon)}{\\log (1 / \\gamma)},\n\\end{aligned}","position":{"start":{"line":1239,"column":1},"end":{"line":1245,"column":1}},"html":"γtv(0)Vπϵtlog(ϵ/v(0)Vπ)logγ=log(v(0)Vπ/ϵ)log(1/γ),\\begin{aligned}\n \\gamma^t \\|v^{(0)} - V^\\pi\\|_{\\infty} &\\le \\epsilon \\\\\n t &\\ge \\frac{\\log (\\epsilon / \\|v^{(0)} - V^\\pi\\|_{\\infty})}{\\log \\gamma} \\\\\n &= \\frac{\\log (\\|v^{(0)} - V^\\pi\\|_{\\infty} / \\epsilon)}{\\log (1 / \\gamma)},\n\\end{aligned}γtv(0)Vπtϵlogγlog(ϵ/∥v(0)Vπ)=log(1/γ)log(v(0)Vπ/ϵ),","enumerator":"1.50","key":"Tcg2rkGXub"},{"type":"paragraph","position":{"start":{"line":1247,"column":1},"end":{"line":1248,"column":1}},"children":[{"type":"text","value":"and so the number of iterations required for an\n","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"Y15t8rWZkv"},{"type":"text","value":"ε","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"RPtQ7kU8fU"},{"type":"text","value":"-accurate estimate is","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"JiyPCOB5Me"}],"key":"OL6wnhBMSF"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).","position":{"start":{"line":1250,"column":1},"end":{"line":1252,"column":1}},"html":"T=O(11γlog(1ϵ(1γ))).T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).T=O(1γ1log(ϵ(1γ)1)).","enumerator":"1.51","key":"CfyMqJWDFP"},{"type":"paragraph","position":{"start":{"line":1254,"column":1},"end":{"line":1256,"column":1}},"children":[{"type":"text","value":"Note that we’ve applied the inequalities\n","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"ZPzikx5gz5"},{"type":"inlineMath","value":"\\|v^{(0)} - V^\\pi\\|_{\\infty} \\le 1/(1-\\gamma)","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"html":"v(0)Vπ1/(1γ)\\|v^{(0)} - V^\\pi\\|_{\\infty} \\le 1/(1-\\gamma)v(0)Vπ1/(1γ)","key":"JmXOlKRx8b"},{"type":"text","value":" and\n","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"VG0mgsUNZL"},{"type":"inlineMath","value":"\\log (1/x) \\ge 1-x","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"html":"log(1/x)1x\\log (1/x) \\ge 1-xlog(1/x)1x","key":"ZiCifhBpk8"},{"type":"text","value":".","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"WOfV9pvkJh"}],"key":"BLd5YXc6e4"}],"enumerator":"1.2","html_id":"iterations-vi","key":"b433r34qNh"},{"type":"heading","depth":3,"position":{"start":{"line":1259,"column":1},"end":{"line":1259,"column":1}},"children":[{"type":"text","value":"Optimal policies in infinite-horizon 
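{"type":"block","children":[{"type":"paragraph","children":[{"type":"text","value":"To get a feel for this bound, here is a small illustrative sketch that computes the iteration count from Remark 1.2, assuming a zero-initialized guess so that the initial error is at most 1/(1 − γ):","key":"adEx3Lead"}],"key":"adEx3Par"}],"key":"adEx3Blk"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"import math\n\n\ndef iterations_needed(γ: float, ε: float) -> int:\n    # smallest t such that γ^t / (1 - γ) <= ε, per Remark 1.2\n    return math.ceil(math.log(1 / (ε * (1 - γ))) / math.log(1 / γ))\n\n\nfor γ in (0.9, 0.95, 0.99):\n    print(γ, iterations_needed(γ, 1e-6))","key":"adEx3Code"}],"data":{},"key":"adEx3CodeBlk"},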
MDPs","position":{"start":{"line":1259,"column":1},"end":{"line":1259,"column":1}},"key":"TQwafS7ydm"}],"identifier":"optimal-policies-in-infinite-horizon-mdps","label":"Optimal policies in infinite-horizon MDPs","html_id":"optimal-policies-in-infinite-horizon-mdps","implicit":true,"enumerator":"1.5.3","key":"AiseTXo51g"},{"type":"paragraph","position":{"start":{"line":1261,"column":1},"end":{"line":1266,"column":1}},"children":[{"type":"text","value":"Now let’s move on to solving for an optimal policy in the\ninfinite-horizon case. As in ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"gvx3oxTEpr"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_policy_finite","label":"optimal_policy_finite","children":[{"type":"text","value":"the finite-horizon case","key":"eIQfUxdJBA"}],"template":"Definition %s","enumerator":"1.10","resolved":true,"html_id":"optimal-policy-finite","key":"u411KdzwLK"},{"type":"text","value":", an ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"ITn0C0OXPz"},{"type":"strong","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"zf5Jo7SuOR"}],"key":"Uifa9FxL1T"},{"type":"text","value":" ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"ZI457F88XA"},{"type":"inlineMath","value":"\\pi^\\star","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"π\\pi^\\starπ","key":"sbjfXeADYU"},{"type":"text","value":"\nis one that does at least as well as any other policy in all situations.\nThat is, for all policies ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"vWX0dh6iqp"},{"type":"text","value":"π","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"LvJa0zzYel"},{"type":"text","value":", states ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"cBXwk03qG2"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"mSBJeQ8IFs"},{"type":"text","value":", times\n","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"PUmy4hMLSp"},{"type":"inlineMath","value":"\\hi \\in \\mathbb{N}","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"hN\\hi \\in \\mathbb{N}hN","key":"sSnWUMyQI7"},{"type":"text","value":", and initial trajectories\n","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"XxXVmu5wFC"},{"type":"inlineMath","value":"\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"τh=(s0,a0,r0,,sh)\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)τh=(s0,a0,r0,,sh)","key":"lL7bYbt7at"},{"type":"text","value":" where ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"bf46t43gVS"},{"type":"inlineMath","value":"s_\\hi = s","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"sh=ss_\\hi = ssh=s","key":"mLh1LQqB4E"},{"type":"text","value":",","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"phLLtXG6cW"}],"key":"quHHP4nSxX"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^\\star}(s) &= \\E_{\\tau 
\\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid \\tau_\\hi]\n\\end{aligned}","label":"optimal_policy_infinite","identifier":"optimal_policy_infinite","html":"Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s]Eτρπ[rh+γrh+1+γ2rh+2+τh]\\begin{aligned}\n V^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid \\tau_\\hi]\n\\end{aligned}Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s]Eτρπ[rh+γrh+1+γ2rh+2+τh]","enumerator":"1.52","html_id":"optimal-policy-infinite","key":"uZnc8GsEPd"},{"type":"paragraph","position":{"start":{"line":1278,"column":1},"end":{"line":1279,"column":1}},"children":[{"type":"text","value":"Once again, all optimal policies share the same ","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"rHHiistKze"},{"type":"strong","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"fYckBHpVMJ"}],"key":"rYfXu0YXfI"},{"type":"text","value":" ","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"j7Fd8wd19P"},{"type":"inlineMath","value":"V^\\star","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"html":"VV^\\starV","key":"iJ9jeZJFfC"},{"type":"text","value":", and the greedy policy with respect to this value function\nis optimal.","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"OANAUsy4yG"}],"key":"X7z6lSjFHb"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"QasUlMzpU8"}],"key":"xAYvzCxmel"},{"type":"paragraph","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"children":[{"type":"text","value":"Verify this by modifying the proof ","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"key":"M8DtBq7V4L"},{"type":"crossReference","kind":"proof:theorem","identifier":"optimal_greedy","label":"optimal_greedy","children":[{"type":"text","value":"Theorem ","key":"Elg6XV13C6"},{"type":"text","value":"1.3","key":"RdXqVHeBUT"}],"template":"Theorem %s","enumerator":"1.3","resolved":true,"html_id":"optimal-greedy","key":"ZR2XoyHAq0"},{"type":"text","value":" from the finite-horizon case.","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"key":"JIeEPhlk0K"}],"key":"wiG7Uo6uCB"}],"key":"TsXlV7Ne0n"},{"type":"paragraph","position":{"start":{"line":1285,"column":1},"end":{"line":1289,"column":1}},"children":[{"type":"text","value":"So how can we compute such an optimal policy? 
We can’t use the backwards\nDP approach from the finite-horizon case ","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"FC0DJHOdyA"},{"type":"crossReference","kind":"proof:definition","identifier":"pi_star_dp","label":"pi_star_dp","children":[{"type":"text","value":"Definition ","key":"UEwzNB4QOZ"},{"type":"text","value":"1.11","key":"hV7NANmgwl"}],"template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","key":"c7gPktYA3y"},{"type":"text","value":" since there’s no “final timestep” to start\nfrom. Instead, we’ll exploit the fact that the Bellman consistency\nequation ","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"FcXt4aCsQI"},{"type":"crossReference","kind":"equation","identifier":"bellman_consistency_infinite","label":"bellman_consistency_infinite","children":[{"type":"text","value":"(","key":"jv5qnDbKrM"},{"type":"text","value":"1.32","key":"afYSC74Ayx"},{"type":"text","value":")","key":"xgsDtfsG7Q"}],"template":"(%s)","enumerator":"1.32","resolved":true,"html_id":"bellman-consistency-infinite","key":"KAgpa3uBy0"},{"type":"text","value":" for the optimal value\nfunction doesn’t depend on any policy:","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"lPIP0UIvKh"}],"key":"FmS8pOWXJF"},{"type":"math","value":"V^\\star(s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^\\star(s') \\right].","label":"bellman_optimality","identifier":"bellman_optimality","html":"V(s)=maxa[r(s,a)+γEsP(s,a)V(s)].V^\\star(s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^\\star(s') \\right].V(s)=amax[r(s,a)+γEsP(s,a)V(s)].","enumerator":"1.53","html_id":"bellman-optimality","key":"OPQctzd2PX"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"DmX6tTjq9u"}],"key":"IYxyYQt0Sk"},{"type":"paragraph","position":{"start":{"line":1298,"column":1},"end":{"line":1299,"column":1}},"children":[{"type":"text","value":"Verify this by substituting the greedy policy into the\nBellman consistency equation.","position":{"start":{"line":1298,"column":1},"end":{"line":1298,"column":1}},"key":"I3LecPwXOb"}],"key":"aYksWOYopj"}],"key":"MKY2xhS8JD"},{"type":"paragraph","position":{"start":{"line":1302,"column":1},"end":{"line":1303,"column":1}},"children":[{"type":"text","value":"As before, thinking of the r.h.s. 
of ","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"Jic5PBmn36"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality","label":"bellman_optimality","children":[{"type":"text","value":"(","key":"esLMkdGILN"},{"type":"text","value":"1.53","key":"fK4QtiuQcs"},{"type":"text","value":")","key":"JmHzOH78kv"}],"template":"(%s)","enumerator":"1.53","resolved":true,"html_id":"bellman-optimality","key":"TpGxgz5Ci7"},{"type":"text","value":" as an operator on value functions\ngives the ","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"tj0gsAJLih"},{"type":"strong","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"children":[{"type":"text","value":"Bellman optimality operator","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"HgHodGQLNR"}],"key":"zoGE1Wn7i0"}],"key":"pgyOiltgJX"},{"type":"math","value":"[\\mathcal{J}^{\\star}(v)](s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s') \\right]","label":"bellman_optimality_operator","identifier":"bellman_optimality_operator","html":"[J(v)](s)=maxa[r(s,a)+γEsP(s,a)v(s)][\\mathcal{J}^{\\star}(v)](s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s') \\right][J(v)](s)=amax[r(s,a)+γEsP(s,a)v(s)]","enumerator":"1.54","html_id":"bellman-optimality-operator","key":"AuFIiNgpD6"}],"key":"SZcNUlFvkl"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_optimality_operator(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \" S\"]:\n return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)\n\n\ndef check_optimal(v: Float[Array, \" S\"], mdp: MDP):\n return jnp.allclose(v, bellman_optimality_operator(v, mdp))","key":"BpJWE1VkGJ"},{"type":"output","id":"Mi8-754YECk71g3a3dI4h","data":[],"key":"sRIXZQ7tGW"}],"data":{},"key":"QrExW4Hfei"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":1321,"column":1},"end":{"line":1321,"column":1}},"children":[{"type":"text","value":"Value iteration","position":{"start":{"line":1321,"column":1},"end":{"line":1321,"column":1}},"key":"PNUyNARGCl"}],"label":"value_iteration","identifier":"value_iteration","html_id":"value-iteration","enumerator":"1.5.3.1","key":"tZcUeBge43"},{"type":"paragraph","position":{"start":{"line":1323,"column":1},"end":{"line":1326,"column":1}},"children":[{"type":"text","value":"Since the optimal policy is still a policy, our result that the Bellman\noperator is a contracting map still holds, and so we can repeatedly\napply this operator to converge to the optimal value function! 
This\nalgorithm is known as ","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"key":"Cn4PfBvmyD"},{"type":"strong","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"key":"HjoNmFnZzV"}],"key":"XArHJ5iIVi"},{"type":"text","value":".","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"key":"GkYcDqpXgk"}],"key":"adG17cTUTf"}],"key":"XhdVUuwj4U"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, \" S\"]:\n \"\"\"Iterate the Bellman optimality operator until convergence.\"\"\"\n op = partial(bellman_optimality_operator, mdp)\n return loop_until_convergence(op, jnp.zeros(mdp.S), ε)","key":"asd0Az9Ytb"},{"type":"output","id":"kLhDvCUty77pWCPjogzCv","data":[],"key":"Uvwiqnvvga"}],"data":{},"key":"aa9G97oANo"},{"type":"block","children":[],"key":"hi3sQ9dZkR"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"value_iteration(tidy_mdp_inf)","key":"PYFYcq7M3q"},{"type":"output","id":"aMn2Eww8z0dbNgKL5O8iS","data":[{"output_type":"execute_result","execution_count":24,"metadata":{},"data":{"text/plain":{"content":"Array([15.564166, 14.785956], dtype=float32)","content_type":"text/plain"}}}],"key":"l0iKFqYvfE"}],"data":{},"key":"TnfQHNztZJ"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":1339,"column":1},"end":{"line":1342,"column":1}},"children":[{"type":"text","value":"Note that the runtime analysis for an ","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"GKFzO4IKPY"},{"type":"text","value":"ε","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"PABuDvY0dk"},{"type":"text","value":"-optimal value function\nis exactly the same as ","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"TzSR2P44ev"},{"type":"crossReference","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"children":[{"type":"text","value":"iterative policy evaluation","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"now1IW7WQT"}],"identifier":"iterative_pe","label":"iterative_pe","kind":"heading","template":"Section %s","enumerator":"1.5.2.2","resolved":true,"html_id":"iterative-pe","key":"mQOgBAw9Fq"},{"type":"text","value":"! 
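{"type":"block","children":[{"type":"paragraph","children":[{"type":"text","value":"Assuming the check_optimal helper defined above (note its (v, mdp) argument order), we can confirm that the fixed point returned by value iteration satisfies the Bellman optimality equation (1.53) up to numerical tolerance:","key":"adEx4Lead"}],"key":"adEx4Par"}],"key":"adEx4Blk"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# The fixed point returned by value iteration should satisfy (1.53).\nv_star = value_iteration(tidy_mdp_inf)\nprint(check_optimal(v_star, tidy_mdp_inf))  # expected: True (up to tolerance)","key":"adEx4Code"}],"data":{},"key":"adEx4CodeBlk"},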
This is because the Bellman optimality operator is also a\nγ-contraction, so the same convergence analysis applies when iterating toward the\n","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"uReYew1a2z"},{"type":"emphasis","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"children":[{"type":"text","value":"optimal","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"ErAk3tToPx"}],"key":"Z8wGfcwelV"},{"type":"text","value":" value function.","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"KaH7hkxUiu"}],"key":"TakuZB5NZM"},{"type":"paragraph","position":{"start":{"line":1344,"column":1},"end":{"line":1346,"column":1}},"children":[{"type":"text","value":"As the final step of the algorithm, to return an actual policy\n","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"key":"HtYoBP8lzz"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"html":"π^\\hat \\piπ^","key":"TOyiH1p0Et"},{"type":"text","value":", we can simply act greedily with respect to the final iteration\n","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"key":"riKwjpahp2"},{"type":"inlineMath","value":"v^{(T)}","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"html":"v(T)v^{(T)}v(T)","key":"vZ0Lc8YFyF"},{"type":"text","value":" of our above algorithm:","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"key":"QMOjNw5tEm"}],"key":"Ya2evA9ViX"},{"type":"math","value":"\\hat \\pi(s) = \\arg\\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v^{(T)}(s') \\right].","position":{"start":{"line":1348,"column":1},"end":{"line":1348,"column":1}},"html":"π^(s)=argmaxa[r(s,a)+γEsP(s,a)v(T)(s)].\\hat \\pi(s) = \\arg\\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v^{(T)}(s') \\right].π^(s)=argamax[r(s,a)+γEsP(s,a)v(T)(s)].","enumerator":"1.55","key":"hUbH4VPeDg"},{"type":"paragraph","position":{"start":{"line":1350,"column":1},"end":{"line":1352,"column":1}},"children":[{"type":"text","value":"We must be careful, though: the value function of this greedy policy,\n","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"dsUnXwIeGh"},{"type":"inlineMath","value":"V^{\\hat \\pi}","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"html":"Vπ^V^{\\hat \\pi}Vπ^","key":"wzQwuvikFl"},{"type":"text","value":", is ","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"vWtu95wOmD"},{"type":"emphasis","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"aOgByk1qRZ"}],"key":"ibThMykL10"},{"type":"text","value":" the same as ","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"YP0WRPu8at"},{"type":"inlineMath","value":"v^{(T)}","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"html":"v(T)v^{(T)}v(T)","key":"xqbJyBF9vR"},{"type":"text","value":", which need not even be the\nvalue function of any policy!","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"b5RRbS1qA5"}],"key":"pVlHrz57xv"},{"type":"paragraph","position":{"start":{"line":1354,"column":1},"end":{"line":1358,"column":1}},"children":[{"type":"text","value":"The bound on the 
policy’s quality is actually quite loose: if\n","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"uCTQBjn7jJ"},{"type":"inlineMath","value":"\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\epsilon","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"html":"v(T)Vϵ\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\epsilonv(T)Vϵ","key":"I443g1PwxH"},{"type":"text","value":", then the greedy policy\n","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"O3VM54DYhb"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"html":"π^\\hat \\piπ^","key":"FwWLZV829I"},{"type":"text","value":" satisfies\n","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"DpKXOg7oJl"},{"type":"inlineMath","value":"\\|V^{\\hat \\pi} - V^\\star\\|_{\\infty} \\le \\frac{2\\gamma}{1-\\gamma} \\epsilon","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"html":"Vπ^V2γ1γϵ\\|V^{\\hat \\pi} - V^\\star\\|_{\\infty} \\le \\frac{2\\gamma}{1-\\gamma} \\epsilonVπ^V1γ2γϵ","key":"NysGxShVLA"},{"type":"text","value":",\nwhich might potentially be very large.","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"rykZX1jVQm"}],"key":"sqcJLZNNjl"},{"type":"proof","kind":"theorem","label":"greedy_worsen","identifier":"greedy_worsen","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Greedy policy value worsening","position":{"start":{"line":1360,"column":1},"end":{"line":1360,"column":1}},"key":"Fqmprcqg05"}],"key":"Vzna3A9qRi"},{"type":"math","value":"\\|V^{\\hat \\pi} - V^\\star \\|_{\\infty} \\le \\frac{2 \\gamma}{1-\\gamma} \\|v - V^\\star\\|_{\\infty}","position":{"start":{"line":1363,"column":1},"end":{"line":1363,"column":1}},"html":"Vπ^V2γ1γvV\\|V^{\\hat \\pi} - V^\\star \\|_{\\infty} \\le \\frac{2 \\gamma}{1-\\gamma} \\|v - V^\\star\\|_{\\infty}Vπ^V1γ2γvV","enumerator":"1.56","key":"PJrHggcVdt"},{"type":"paragraph","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"key":"zxaZSDqOap"},{"type":"inlineMath","value":"\\hat \\pi(s) = \\arg\\max_a q(s, a)","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"html":"π^(s)=argmaxaq(s,a)\\hat \\pi(s) = \\arg\\max_a q(s, a)π^(s)=argmaxaq(s,a)","key":"hK5azgMbaz"},{"type":"text","value":" is the greedy policy with respect to","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"key":"WRtnNwkGb0"}],"key":"KFqxLalH0q"},{"type":"math","value":"q(s, a) = r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s').","position":{"start":{"line":1367,"column":1},"end":{"line":1367,"column":1}},"html":"q(s,a)=r(s,a)+γEsP(s,a)v(s).q(s, a) = r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s').q(s,a)=r(s,a)+γEsP(s,a)v(s).","enumerator":"1.57","key":"aHqJzxOwOV"}],"enumerator":"1.5","html_id":"greedy-worsen","key":"trX4dQvgVs"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof","position":{"start":{"line":1370,"column":1},"end":{"line":1370,"column":1}},"key":"hgtglqf5Ep"}],"key":"lNsC5NExx3"},{"type":"paragraph","position":{"start":{"line":1371,"column":1},"end":{"line":1371,"column":1}},"children":[{"type":"text","value":"We first 
have","position":{"start":{"line":1371,"column":1},"end":{"line":1371,"column":1}},"key":"MYyrPZICBx"}],"key":"wn1Tt3DS4q"},{"type":"math","value":"\\begin{aligned}\n V^{\\star}(s) - V^{\\hat \\pi}(s) &= Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\\\\\n &= [Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s))] + [Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))].\n\\end{aligned}","position":{"start":{"line":1373,"column":1},"end":{"line":1378,"column":1}},"html":"V(s)Vπ^(s)=Q(s,π(s))Qπ^(s,π^(s))=[Q(s,π(s))Q(s,π^(s))]+[Q(s,π^(s))Qπ^(s,π^(s))].\\begin{aligned}\n V^{\\star}(s) - V^{\\hat \\pi}(s) &= Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\\\\\n &= [Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s))] + [Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))].\n\\end{aligned}V(s)Vπ^(s)=Q(s,π(s))Qπ^(s,π^(s))=[Q(s,π(s))Q(s,π^(s))]+[Q(s,π^(s))Qπ^(s,π^(s))].","enumerator":"1.58","key":"covXVncLp5"},{"type":"paragraph","position":{"start":{"line":1380,"column":1},"end":{"line":1380,"column":1}},"children":[{"type":"text","value":"Let’s bound these two quantities separately.","position":{"start":{"line":1380,"column":1},"end":{"line":1380,"column":1}},"key":"R4VA07hlE8"}],"key":"i2Hni7m9mS"},{"type":"paragraph","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"children":[{"type":"text","value":"For the first quantity, note that by the definition of ","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"key":"wTF1M8zvNJ"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"html":"π^\\hat \\piπ^","key":"ontWZlRcgy"},{"type":"text","value":", we have","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"key":"YJAxQT8Yep"}],"key":"w0ObfRvFQm"},{"type":"math","value":"q(s, \\hat \\pi(s)) \\ge q(s,\\pi^\\star(s)).","position":{"start":{"line":1384,"column":1},"end":{"line":1384,"column":1}},"html":"q(s,π^(s))q(s,π(s)).q(s, \\hat \\pi(s)) \\ge q(s,\\pi^\\star(s)).q(s,π^(s))q(s,π(s)).","enumerator":"1.59","key":"IdVA3Dgctp"},{"type":"paragraph","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"children":[{"type":"text","value":"Let’s add ","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"key":"k85JeqhLBP"},{"type":"inlineMath","value":"q(s, \\hat \\pi(s)) - q(s,\\pi^\\star(s)) \\ge 0","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"html":"q(s,π^(s))q(s,π(s))0q(s, \\hat \\pi(s)) - q(s,\\pi^\\star(s)) \\ge 0q(s,π^(s))q(s,π(s))0","key":"SfjIzMCuuo"},{"type":"text","value":" to the first term to get","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"key":"bsomjqhStJ"}],"key":"RyebvU8lmt"},{"type":"math","value":"\\begin{aligned}\n Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s)) &\\le [Q^{\\star}(s,\\pi^\\star(s))- q(s,\\pi^\\star(s))] + [q(s, \\hat \\pi(s)) - Q^{\\star}(s, \\hat \\pi(s))] \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{\\star}(s))} [ V^{\\star}(s') - v(s') ] + \\gamma \\E_{s' \\sim P(s, \\hat \\pi(s))} [ v(s') - V^{\\star}(s') ] \\\\\n &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty}.\n\\end{aligned}","position":{"start":{"line":1388,"column":1},"end":{"line":1394,"column":1}},"html":"Q(s,π(s))Q(s,π^(s))[Q(s,π(s))q(s,π(s))]+[q(s,π^(s))Q(s,π^(s))]=γEsP(s,π(s))[V(s)v(s)]+γEsP(s,π^(s))[v(s)V(s)]2γvV.\\begin{aligned}\n Q^{\\star}(s,\\pi^\\star(s)) - 
Q^{\\star}(s, \\hat \\pi(s)) &\\le [Q^{\\star}(s,\\pi^\\star(s))- q(s,\\pi^\\star(s))] + [q(s, \\hat \\pi(s)) - Q^{\\star}(s, \\hat \\pi(s))] \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{\\star}(s))} [ V^{\\star}(s') - v(s') ] + \\gamma \\E_{s' \\sim P(s, \\hat \\pi(s))} [ v(s') - V^{\\star}(s') ] \\\\\n &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty}.\n\\end{aligned}Q(s,π(s))Q(s,π^(s))[Q(s,π(s))q(s,π(s))]+[q(s,π^(s))Q(s,π^(s))]=γEsP(s,π(s))[V(s)v(s)]+γEsP(s,π^(s))[v(s)V(s)]2γvV.","enumerator":"1.60","key":"PgiGuos4qf"},{"type":"paragraph","position":{"start":{"line":1397,"column":1},"end":{"line":1397,"column":1}},"children":[{"type":"text","value":"The second quantity is bounded by","position":{"start":{"line":1397,"column":1},"end":{"line":1397,"column":1}},"key":"PoZAnl97Mn"}],"key":"ebLrGH3JDL"},{"type":"math","value":"\\begin{aligned}\n Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\n &=\n \\gamma \\E_{s'\\sim P(s, \\hat \\pi(s))}\\left[ V^\\star(s') - V^{\\hat \\pi}(s') \\right] \\\\\n & \\leq \n \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty\n\\end{aligned}","position":{"start":{"line":1399,"column":1},"end":{"line":1407,"column":1}},"html":"Q(s,π^(s))Qπ^(s,π^(s))=γEsP(s,π^(s))[V(s)Vπ^(s)]γVVπ^\\begin{aligned}\n Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\n &=\n \\gamma \\E_{s'\\sim P(s, \\hat \\pi(s))}\\left[ V^\\star(s') - V^{\\hat \\pi}(s') \\right] \\\\\n & \\leq \n \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty\n\\end{aligned}Q(s,π^(s))Qπ^(s,π^(s))=γEsP(s,π^(s))[V(s)Vπ^(s)]γVVπ^","enumerator":"1.61","key":"QsRcG2AQS6"},{"type":"paragraph","position":{"start":{"line":1409,"column":1},"end":{"line":1409,"column":1}},"children":[{"type":"text","value":"and thus","position":{"start":{"line":1409,"column":1},"end":{"line":1409,"column":1}},"key":"G3Pno3k4CK"}],"key":"OdQiyetXLA"},{"type":"math","value":"\\begin{aligned}\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty} + \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty \\\\\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le \\frac{2 \\gamma \\|v - V^{\\star}\\|_{\\infty}}{1-\\gamma}.\n\\end{aligned}","position":{"start":{"line":1411,"column":1},"end":{"line":1416,"column":1}},"html":"VVπ^2γvV+γVVπ^VVπ^2γvV1γ.\\begin{aligned}\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty} + \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty \\\\\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le \\frac{2 \\gamma \\|v - V^{\\star}\\|_{\\infty}}{1-\\gamma}.\n\\end{aligned}VVπ^VVπ^2γvV+γVVπ^1γ2γvV.","enumerator":"1.62","key":"oOUrZH5LdP"}],"enumerator":"1.3","key":"Wco1SsBgnV"},{"type":"paragraph","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"children":[{"type":"text","value":"So in order to compensate and achieve ","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"key":"zXcJ5OBcS7"},{"type":"inlineMath","value":"\\|V^{\\hat \\pi} - V^{\\star}\\| \\le \\epsilon","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"html":"Vπ^Vϵ\\|V^{\\hat \\pi} - V^{\\star}\\| \\le \\epsilonVπ^Vϵ","key":"gv1qxl18r4"},{"type":"text","value":", we must have","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"key":"Lb7LwmPw8p"}],"key":"T3DeGZ2zWh"},{"type":"math","value":"\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\frac{1-\\gamma}{2 \\gamma} 
\\epsilon.","position":{"start":{"line":1421,"column":1},"end":{"line":1421,"column":1}},"html":"v(T)V1γ2γϵ.\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\frac{1-\\gamma}{2 \\gamma} \\epsilon.v(T)V2γ1γϵ.","enumerator":"1.63","key":"GPoBBeUpeO"},{"type":"paragraph","position":{"start":{"line":1423,"column":1},"end":{"line":1423,"column":1}},"children":[{"type":"text","value":"This means, using ","position":{"start":{"line":1423,"column":1},"end":{"line":1423,"column":1}},"key":"o5zv3RCkvE"},{"type":"crossReference","kind":"proof:remark","identifier":"iterations_vi","label":"iterations_vi","children":[{"type":"text","value":"Remark ","key":"EO5lahx81B"},{"type":"text","value":"1.2","key":"IW15FdwdKh"}],"template":"Remark %s","enumerator":"1.2","resolved":true,"html_id":"iterations-vi","key":"OgM4fMZFnq"},{"type":"text","value":", we need to run value iteration for","position":{"start":{"line":1423,"column":1},"end":{"line":1423,"column":1}},"key":"IyPghofFvH"}],"key":"UvDRJhvhOw"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{\\gamma}{\\epsilon (1-\\gamma)^2}\\right) \\right)","position":{"start":{"line":1425,"column":1},"end":{"line":1425,"column":1}},"html":"T=O(11γlog(γϵ(1γ)2))T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{\\gamma}{\\epsilon (1-\\gamma)^2}\\right) \\right)T=O(1γ1log(ϵ(1γ)2γ))","enumerator":"1.64","key":"iqwGYohLnX"},{"type":"paragraph","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"children":[{"type":"text","value":"iterations to achieve an ","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"key":"mJLuQ9K9gN"},{"type":"text","value":"ε","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"key":"SUZSVAp6VA"},{"type":"text","value":"-accurate estimate of the optimal value function.","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"key":"IRM80WyzmY"}],"key":"M3pPjH0SXN"},{"type":"heading","depth":4,"position":{"start":{"line":1431,"column":1},"end":{"line":1431,"column":1}},"children":[{"type":"text","value":"Policy iteration","position":{"start":{"line":1431,"column":1},"end":{"line":1431,"column":1}},"key":"KGtoEcDCeR"}],"label":"policy_iteration","identifier":"policy_iteration","html_id":"policy-iteration","enumerator":"1.5.3.2","key":"aX2zdd7jUI"},{"type":"paragraph","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"children":[{"type":"text","value":"Can we mitigate this “greedy worsening”? What if instead of approximating the optimal value function and then acting greedily by it at the very end, we iteratively improve the policy and value function ","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"hTa5MdAqdd"},{"type":"emphasis","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"children":[{"type":"text","value":"together","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"d5aEC7HhEI"}],"key":"vtozKmG1W8"},{"type":"text","value":"? This is the idea behind ","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"d1FsiWs6bR"},{"type":"strong","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"LDvPSQb7O9"}],"key":"QpnK9sA78u"},{"type":"text","value":". 
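{"type":"block","children":[{"type":"paragraph","children":[{"type":"text","value":"As a concrete illustration of this compensation: for γ = 0.95 the greedy worsening factor 2γ/(1 − γ) is 38, so the value estimate must be about 38 times more accurate than the desired policy suboptimality:","key":"adEx5Lead"}],"key":"adEx5Par"}],"key":"adEx5Blk"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"γ, ε = 0.95, 1e-2  # illustrative target: a 0.01-optimal greedy policy\nblowup = 2 * γ / (1 - γ)  # greedy worsening factor from Theorem 1.5\nrequired = ε / blowup  # accuracy needed in v^(T), as in (1.63)\nprint(blowup, required)  # 38.0 and roughly 2.6e-4","key":"adEx5Code"}],"data":{},"key":"adEx5CodeBlk"},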
In each step, we simply set the policy to act greedily with respect to its own value function.","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"VF9Q0cwsUo"}],"key":"mRpVSsFvIV"}],"key":"cLZHc2vvpQ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, \"S A\"]:\n \"\"\"Iteratively improve the policy and value function.\"\"\"\n def op(pi):\n return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi))\n π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A # uniform random policy\n return loop_until_convergence(op, π_init, ε)","key":"U5w0yChy1P"},{"type":"output","id":"St11RGR1loExkjfV71uMv","data":[],"key":"RHsftTU9Qz"}],"data":{},"key":"SWnMCRAVH4"},{"type":"block","children":[],"key":"m8JwfgWRMK"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"policy_iteration(tidy_mdp_inf)","key":"v28nHj8V84"},{"type":"output","id":"eK9lUbmE0ah5SSjs42LSd","data":[{"output_type":"execute_result","execution_count":26,"metadata":{},"data":{"text/plain":{"content":"Array([[1., 0.],\n [0., 1.]], dtype=float32)","content_type":"text/plain"}}}],"key":"iItvbt1bpK"}],"data":{},"key":"jBJ5g3ZkOI"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"children":[{"type":"text","value":"Although PI appears more complex than VI, we’ll use the same contraction property ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"ywwQ9oBdPH"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_contraction","label":"bellman_contraction","children":[{"type":"text","value":"Theorem ","key":"EMXt4hG821"},{"type":"text","value":"1.4","key":"lxSDrewGTd"}],"template":"Theorem %s","enumerator":"1.4","resolved":true,"html_id":"bellman-contraction","key":"ctNmxnTORC"},{"type":"text","value":" to show convergence. 
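{"type":"block","children":[{"type":"paragraph","children":[{"type":"text","value":"The array returned above is a one-hot policy matrix: row s gives the probability of choosing each action in state s. A hypothetical one-liner to read off the chosen actions:","key":"adEx6Lead"}],"key":"adEx6Par"}],"key":"adEx6Blk"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# Hypothetical: read off the action chosen in each state from the\n# one-hot policy matrix returned by policy_iteration.\nπ_star = policy_iteration(tidy_mdp_inf)\nprint(jnp.argmax(π_star, axis=1))  # e.g. [0 1] given the output above","key":"adEx6Code"}],"data":{},"key":"adEx6CodeBlk"},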
This will give us the same runtime bound as value iteration and iterative policy evaluation for an ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"sTwvlbhcVh"},{"type":"text","value":"ε","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"VU412Robhz"},{"type":"text","value":"-optimal value function ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"wVZKgKMmKk"},{"type":"crossReference","kind":"proof:remark","identifier":"iterations_vi","label":"iterations_vi","children":[{"type":"text","value":"Remark ","key":"aXWhqcx4QF"},{"type":"text","value":"1.2","key":"yCXR2keE6y"}],"template":"Remark %s","enumerator":"1.2","resolved":true,"html_id":"iterations-vi","key":"Rc2SBasXUW"},{"type":"text","value":", although in practice, PI often converges much faster.","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"v8pu54wSyq"}],"key":"HKdCr0n5tP"},{"type":"proof","kind":"theorem","label":"pi_iter_analysis","identifier":"pi_iter_analysis","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy Iteration runtime and convergence","position":{"start":{"line":1450,"column":1},"end":{"line":1450,"column":1}},"key":"XJ0aSP6ccg"}],"key":"fYKJsOol9C"},{"type":"paragraph","position":{"start":{"line":1453,"column":1},"end":{"line":1454,"column":1}},"children":[{"type":"text","value":"We aim to show that the number of iterations required for an\n","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"VtPYW6jDtb"},{"type":"text","value":"ε","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"UNuEhhGH8p"},{"type":"text","value":"-accurate estimate of the optimal value function is","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"DYZEYrdAAd"}],"key":"RSXFxpifB4"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).","position":{"start":{"line":1456,"column":1},"end":{"line":1456,"column":1}},"html":"T=O(11γlog(1ϵ(1γ))).T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).T=O(1γ1log(ϵ(1γ)1)).","enumerator":"1.65","key":"rQflz75ZlQ"},{"type":"paragraph","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"children":[{"type":"text","value":"This bound follows from the contraction property ","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"key":"CNuvrfLNDn"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"gW0LXMOcta"},{"type":"text","value":"1.38","key":"oz5dX9Vi1i"},{"type":"text","value":")","key":"rO4RadtYOZ"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"WIdms8Skqt"},{"type":"text","value":":","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"key":"QWsOYOWQPO"}],"key":"zBKgeClket"},{"type":"math","value":"\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.","position":{"start":{"line":1460,"column":1},"end":{"line":1460,"column":1}},"html":"Vπt+1VγVπtV.\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star 
\\|_{\\infty}.Vπt+1VγVπtV.","enumerator":"1.66","key":"ldSxJnUxPi"},{"type":"paragraph","position":{"start":{"line":1462,"column":1},"end":{"line":1463,"column":1}},"children":[{"type":"text","value":"We’ll prove that the iterates of PI respect the contraction property by\nshowing that the policies improve monotonically:","position":{"start":{"line":1462,"column":1},"end":{"line":1462,"column":1}},"key":"VCYivHIcRk"}],"key":"cw7bt7nKIB"},{"type":"math","value":"V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s).","position":{"start":{"line":1465,"column":1},"end":{"line":1465,"column":1}},"html":"Vπt+1(s)Vπt(s).V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s).Vπt+1(s)Vπt(s).","enumerator":"1.67","key":"IGoSL2aOCm"},{"type":"paragraph","position":{"start":{"line":1467,"column":1},"end":{"line":1468,"column":1}},"children":[{"type":"text","value":"Then we’ll use this to show\n","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"key":"AJzqE00DBQ"},{"type":"inlineMath","value":"V^{\\pi^{t+1}}(s) \\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"html":"Vπt+1(s)[J(Vπt)](s)V^{\\pi^{t+1}}(s) \\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)Vπt+1(s)[J(Vπt)](s)","key":"CI35A4cyyZ"},{"type":"text","value":". Note that","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"key":"n12S14IoPB"}],"key":"sc6ZXOS9Pz"},{"type":"math","value":"\\begin{aligned}\n[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) &= \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^{\\pi^{t}}(s') \\right] \\\\\n &= r(s, \\pi^{t+1}(s)) + \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} V^{\\pi^{t}}(s')\n\\end{aligned}","position":{"start":{"line":1470,"column":1},"end":{"line":1475,"column":1}},"html":"[J(Vπt)](s)=maxa[r(s,a)+γEsP(s,a)Vπt(s)]=r(s,πt+1(s))+γEsP(s,πt+1(s))Vπt(s)\\begin{aligned}\n[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) &= \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^{\\pi^{t}}(s') \\right] \\\\\n &= r(s, \\pi^{t+1}(s)) + \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} V^{\\pi^{t}}(s')\n\\end{aligned}[J(Vπt)](s)=amax[r(s,a)+γEsP(s,a)Vπt(s)]=r(s,πt+1(s))+γEsP(s,πt+1(s))Vπt(s)","enumerator":"1.68","key":"E0ZzDFyBAN"},{"type":"paragraph","position":{"start":{"line":1477,"column":1},"end":{"line":1478,"column":1}},"children":[{"type":"text","value":"Since\n","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"key":"NRz69arGvN"},{"type":"inlineMath","value":"[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) \\ge V^{\\pi^{t}}(s)","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"html":"[J(Vπt)](s)Vπt(s)[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) \\ge V^{\\pi^{t}}(s)[J(Vπt)](s)Vπt(s)","key":"zT4kgNWrT4"},{"type":"text","value":", we then have","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"key":"bD2e8uqZn8"}],"key":"GmVd5G83Q3"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') 
\\right].\n\\end{aligned}Vπt+1(s)Vπt(s)Vπt+1(s)J(Vπt)(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)].","enumerator":"1.69","html_id":"pi-iter-proof","key":"txyklh9jsH"},{"type":"paragraph","position":{"start":{"line":1489,"column":1},"end":{"line":1492,"column":1}},"children":[{"type":"text","value":"But note that the\nexpression being averaged is the same as the expression on the l.h.s.\nwith ","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"sh85fT19qz"},{"type":"inlineMath","value":"s","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"html":"sss","key":"GXiNiUNQ75"},{"type":"text","value":" replaced by ","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"P8H9sYZLfN"},{"type":"inlineMath","value":"s'","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"html":"ss's","key":"OnRjsg3fyS"},{"type":"text","value":". So we can apply the same inequality\nrecursively to get","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"uweWV7u3S7"}],"key":"MSrYwBRcD2"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge \\gamma^2 \\E_{\\substack{s' \\sim P(s, \\pi^{t+1}(s)) \\\\ s'' \\sim P(s', \\pi^{t+1}(s'))}} \\left[V^{\\pi^{t+1}}(s'') - V^{\\pi^{t}}(s'') \\right]\\\\\n &\\ge \\cdots\n\\end{aligned}","position":{"start":{"line":1494,"column":1},"end":{"line":1500,"column":1}},"html":"Vπt+1(s)Vπt(s)γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]γ2EsP(s,πt+1(s))sP(s,πt+1(s))[Vπt+1(s)Vπt(s)]\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge \\gamma^2 \\E_{\\substack{s' \\sim P(s, \\pi^{t+1}(s)) \\\\ s'' \\sim P(s', \\pi^{t+1}(s'))}} \\left[V^{\\pi^{t+1}}(s'') - V^{\\pi^{t}}(s'') \\right]\\\\\n &\\ge \\cdots\n\\end{aligned}Vπt+1(s)Vπt(s)γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]γ2EsP(s,πt+1(s))s′′P(s,πt+1(s))[Vπt+1(s′′)Vπt(s′′)]","enumerator":"1.70","key":"NbBHNtJZBk"},{"type":"paragraph","position":{"start":{"line":1502,"column":1},"end":{"line":1506,"column":1}},"children":[{"type":"text","value":"which implies that ","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"SJnc8guwQR"},{"type":"inlineMath","value":"V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s)","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"html":"Vπt+1(s)Vπt(s)V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s)Vπt+1(s)Vπt(s)","key":"CPKTPRfURS"},{"type":"text","value":"\nfor all ","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"cjzT7LgWHn"},{"type":"inlineMath","value":"s","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"html":"sss","key":"IgYYXLQfU8"},{"type":"text","value":" (since the r.h.s. converges to zero). 
We can then plug this\nback into\n","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"XMA7Jzz51Z"},{"type":"crossReference","kind":"equation","identifier":"pi_iter_proof","label":"pi_iter_proof","children":[{"type":"text","value":"(","key":"Cx8GY9eCZg"},{"type":"text","value":"1.69","key":"SYVxHoIZ2G"},{"type":"text","value":")","key":"B8ec0zPxfC"}],"template":"(%s)","enumerator":"1.69","resolved":true,"html_id":"pi-iter-proof","key":"GP1CF9PawE"},{"type":"text","value":"\nto get the desired result:","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"dsBREOWh4o"}],"key":"oO332kZ4jV"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge 0 \\\\\n V^{\\pi^{t+1}}(s) &\\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)\n\\end{aligned}","position":{"start":{"line":1508,"column":1},"end":{"line":1514,"column":1}},"html":"Vπt+1(s)J(Vπt)(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]0Vπt+1(s)[J(Vπt)](s)\\begin{aligned}\n V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge 0 \\\\\n V^{\\pi^{t+1}}(s) &\\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)\n\\end{aligned}Vπt+1(s)J(Vπt)(s)Vπt+1(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]0[J(Vπt)](s)","enumerator":"1.71","key":"CgLA0cZEbG"},{"type":"paragraph","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"children":[{"type":"text","value":"This means we can now apply the Bellman convergence result ","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"key":"a28uC4GIdg"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"Km3O2YIrl9"},{"type":"text","value":"1.38","key":"PUDAsgfrSn"},{"type":"text","value":")","key":"XopJ4dRBhh"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"aNYOfLP8PR"},{"type":"text","value":" to get","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"key":"Zwur4MKI29"}],"key":"vEBsuoyEWl"},{"type":"math","value":"\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\|\\mathcal{J}^{\\star} (V^{\\pi^{t}}) - V^{\\star}\\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.","position":{"start":{"line":1518,"column":1},"end":{"line":1518,"column":1}},"html":"Vπt+1VJ(Vπt)VγVπtV.\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\|\\mathcal{J}^{\\star} (V^{\\pi^{t}}) - V^{\\star}\\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star 
## 1.6 Summary

- Markov decision processes (MDPs) are a framework for sequential
  decision making under uncertainty. They consist of a state space
  $\mathcal{S}$, an action space $\mathcal{A}$, an initial state distribution
  $\mu \in \Delta(\mathcal{S})$, a transition function $P(s' \mid s, a)$, and a
  reward function $r(s, a)$. They can be finite-horizon (ending after
  $H$ timesteps) or infinite-horizon (where rewards are discounted by
  $\gamma \in (0, 1)$ at each timestep).
- Our goal is to find a policy $\pi$ that maximizes expected total
  reward. Policies can be **deterministic** or **stochastic**,
  **state-dependent** or **history-dependent**, and **stationary** or
  **time-dependent**.
- A policy induces a distribution over **trajectories**.
- We can evaluate a policy by computing its **value function**
  $V^\pi(s)$, which is the expected total reward starting from state
  $s$ and following policy $\pi$. We can also compute the
  **state-action value function** $Q^\pi(s, a)$, which is the expected
  total reward starting from state $s$, taking action $a$, and then
  following policy $\pi$. In the finite-horizon setting, these also
  depend on the timestep $h$.
- The **Bellman consistency equation** is an equation that the value
  function must satisfy. It can be used to solve for the value
  functions exactly. Viewing the right-hand side of this equation as an
  operator on value functions gives the **Bellman operator**.
- In the finite-horizon setting, we can compute the optimal policy
  using **dynamic programming**.
- In the infinite-horizon setting, we can compute the optimal policy
  using **value iteration** or **policy iteration** (see the sketch
  after this list).
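As a companion to the policy iteration sketch above, here is a minimal sketch of value iteration under the same assumed tabular layout (`P` of shape `(S, A, S)`, `r` of shape `(S, A)`); the function name is ours. It simply iterates the optimal Bellman operator until the sup-norm update falls below a tolerance, which the contraction property guarantees will happen.

```python
import jax.numpy as jnp

def value_iteration(P, r, γ, tol=1e-6):
    """Iterate the optimal Bellman operator to (approximately) its fixed point V*."""
    V = jnp.zeros(P.shape[0])
    while True:
        V_new = jnp.max(r + γ * P @ V, axis=1)  # [J*(V)](s) = max_a Q(s, a)
        if jnp.max(jnp.abs(V_new - V)) < tol:   # sup-norm stopping rule
            return V_new, jnp.argmax(r + γ * P @ V_new, axis=1)
        V = V_new
```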
(ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.png","thumbnailOptimized":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.webp","exports":[{"format":"md","filename":"mdps.md","url":"/build/mdps-eb86bf115f025d31fd89a81ae9f29e0d.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"QAJ47NVJ2e"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"1.1","key":"YwkImtjGje"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"The field of RL studies how an agent can learn to make sequential decisions in an interactive environment.\nThis is a very general problem!\nHow can we ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"gAk5uqJbBY"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"formalize","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ZhaIbgzD5V"}],"key":"l0VquglOiZ"},{"type":"text","value":" this task in a way that is both ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"lsg2v8KT8Q"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"sufficiently general","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"d8kSa81KyS"}],"key":"kAezd8rLgB"},{"type":"text","value":" yet also tractable enough for ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"wnCeMdJgMq"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"fruitful analysis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"IWpf8TNY29"}],"key":"U0oAsnunZ5"},{"type":"text","value":"?","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"bf5K3N7xvr"}],"key":"UHBjldtajv"},{"type":"paragraph","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"Let’s consider some examples of sequential decision problems to identify the key common properties we’d like to 
capture:","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"aoCkCjRoRr"}],"key":"FAJQfeK17E"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":26,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"strong","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"text","value":"Board games and video games,","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"key":"vWoOPEvRve"}],"key":"bAqv4CR5WD"},{"type":"text","value":" where a player takes actions in a virtual environment.","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"key":"hrwpRFk4XJ"}],"key":"gd8Bh4HDsJ"},{"type":"listItem","spread":true,"position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"Inventory management,","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"sDnXjgA0nL"}],"key":"O3mYKyeCox"},{"type":"text","value":" where a company must efficiently move resources from producers to consumers.","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"OPC3tGtCPM"}],"key":"tFRfkSpJZi"},{"type":"listItem","spread":true,"position":{"start":{"line":28,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"strong","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"Robotic control","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"oFNXomSU5u"}],"key":"kNJKIJ6GSh"},{"type":"text","value":", where a robot can move and interact with the real world to complete some task.","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"ovLyfJE6aJ"}],"key":"qQkiBt3Upn"}],"key":"z5NEfFctaA"},{"type":"paragraph","position":{"start":{"line":30,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"In these environments and many others, the ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"igdPEkY5O6"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"TPU0k8Vdv1"}],"key":"OktC737tpp"},{"type":"text","value":",\nthe “rules” of the environment,\nonly depend on the ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"tQIaXakFG7"},{"type":"emphasis","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"most recent","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"yejg4EusaG"}],"key":"NVpJAcNgyg"},{"type":"text","value":" state and action (generally speaking).\nFor example, if you want to take a break while playing a game of chess,\nyou could take a picture of the board,\nand later on reset the board to that state and continue playing;\nthe past history of moves doesn’t matter (generally speaking).\nThis is called the 
","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"Rws0fvpZqo"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"Markov property.","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"lE03lqsBVd"}],"key":"Xq5hXgma9B"}],"key":"Rb5ZHXaB9r"},{"type":"proof","kind":"definition","label":"markov","identifier":"markov","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Markov property","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"key":"DMwpY8Tw98"}],"key":"Z6YoMw57gU"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"An interactive environment satisfies the ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"GcZ1s0OQMh"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"Markov property","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"wbSAIbmnBD"}],"key":"izG9JNkdOg"},{"type":"text","value":" if the\nprobability of transitioning to a new state only depends on the current\nstate and action:","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"Z1ZTEBX7CV"}],"key":"jcxB688ddN"},{"type":"math","value":"\\pr(s_{\\hi+1} \\mid s_0, a_0, \\dots, s_\\hi, a_\\hi) = P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"P(sh+1s0,a0,,sh,ah)=P(sh+1sh,ah)\\pr(s_{\\hi+1} \\mid s_0, a_0, \\dots, s_\\hi, a_\\hi) = P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)P(sh+1s0,a0,,sh,ah)=P(sh+1sh,ah)","enumerator":"1.1","key":"p5kWz5ZHaf"},{"type":"paragraph","position":{"start":{"line":48,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"ve4wXXC46B"},{"type":"inlineMath","value":"P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"P:S×A(S)P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})P:S×A(S)","key":"iLJFUWhgUo"},{"type":"text","value":" describes the state transitions.\n(We’ll elaborate on this notation later in the chapter.)","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"L0LBM1AdOZ"}],"key":"noq6a4naEw"}],"enumerator":"1.1","html_id":"markov","key":"YJGh1Z5lPz"},{"type":"paragraph","position":{"start":{"line":52,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"Environments that satisfy the Markov property are called ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"Ef6ZTcOnzI"},{"type":"strong","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"children":[{"type":"text","value":"Markov decision processes","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"HzVtI61cF2"}],"key":"mwex4J9tWD"},{"type":"text","value":" (MDPs).\nThis chapter will focus on introducing core vocabulary for MDPs that will be useful throughout the 
book.","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"Usmv9D67Xc"}],"key":"vZmHt5zoy8"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"BQzVF6zlX8"}],"key":"D3FyAnc9P5"},{"type":"paragraph","position":{"start":{"line":56,"column":1},"end":{"line":58,"column":1}},"children":[{"type":"text","value":"What information might be encoded in the ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"kSlAFwh9tF"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"state","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"ItQvIUWC7f"}],"key":"jYd1GDRwww"},{"type":"text","value":" for each of the above examples?\nWhat might the valid set of ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"NKF1JT3BQH"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"nJN6w8PAdv"}],"key":"JQ1PEb5jiE"},{"type":"text","value":" be?\nDescribe the ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"gX1niptpY7"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"PwKyouz0ES"}],"key":"Fj9RiTt9SJ"},{"type":"text","value":" heuristically and verify that they satisfy the Markov property.","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"VyXu82mquZ"}],"key":"bcwA7nFBlv"}],"key":"rtsJ1yvCci"},{"type":"paragraph","position":{"start":{"line":61,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"MDPs are usually classified as ","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"IY2jyfFqVJ"},{"type":"strong","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"finite-horizon","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"r1QBRrOaez"}],"key":"HMNdO6FtUR"},{"type":"text","value":", where the interactions end after some finite number of time steps,\nor ","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"jtge0JiqGy"},{"type":"strong","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"infinite-horizon","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"Xt1mqRbRR5"}],"key":"vWbbnfkYGB"},{"type":"text","value":", where the interactions can continue indefinitely.\nWe’ll begin with the finite-horizon case and discuss the infinite-horizon case in the second half of the chapter.","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"JQF2aXMSDd"}],"key":"qU1BCkb6oP"},{"type":"paragraph","position":{"start":{"line":65,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"We’ll describe how to 
","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"uJItuS1QEO"},{"type":"emphasis","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"NMNAQpoFkl"}],"key":"ERc00j0lD5"},{"type":"text","value":" different strategies, called ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"c5QFf1MJOx"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"policies,","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"dlPr7LRpJO"}],"key":"xtWJLJWMqo"},{"type":"text","value":" and how to compute (or approximate)\nthe ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"KhXHGlFOiV"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"hdelaD3iIM"}],"key":"FlsugMJ262"},{"type":"text","value":" for a given MDP.\nWe’ll introduce the ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"yJmRBBYt7o"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"Bellman consistency condition","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"wQEK3m50eI"}],"key":"uF8IFarWKH"},{"type":"text","value":", which allows us to analyze the whole sequence of interactions in terms of individual timesteps.","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"Uyur9rdMg6"}],"key":"k4kiYSELEE"}],"key":"hf3p76PExN"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from utils import NamedTuple, Float, Array, partial, jax, jnp, latexify","key":"h6XQoXdD0T"},{"type":"output","id":"Pk6hHeWLnMBjg3fYOQgNo","data":[],"key":"Xk3u0a4nOk"}],"data":{},"key":"RYpOHuSp5D"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"children":[{"type":"text","value":"Finite-horizon MDPs","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"uqzwzKwecE"}],"identifier":"finite-horizon-mdps","label":"Finite-horizon MDPs","html_id":"finite-horizon-mdps","implicit":true,"enumerator":"1.2","key":"iG9UCQssBl"},{"type":"heading","depth":3,"position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"children":[{"type":"text","value":"Definition","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"lWiEA8uDVm"}],"identifier":"definition","label":"Definition","html_id":"definition","implicit":true,"enumerator":"1.2.1","key":"H3vHjCQ72w"},{"type":"proof","kind":"definition","label":"finite_horizon_mdp","identifier":"finite_horizon_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Finite-horizon Markov decision process","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"F18ZPqWHEh"}],"key":"TXjXhpikez"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"The components of a finite-horizon Markov decision process 
are:","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"TJoqxfwClm"}],"key":"syjqAE2bmi"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":82,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":82,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":82,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"tak4cBa7pQ"},{"type":"strong","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"state","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"TfumiyQ6pL"}],"key":"JzxFTDCIQH"},{"type":"text","value":" that the agent interacts with. We use ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"VhuBbZk2PF"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"html":"S\\mathcal{S}S","key":"FjDnbFJqqk"},{"type":"text","value":" to denote\nthe set of possible states, called the ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"reGUFtCfpk"},{"type":"strong","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"state space","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"DfLlsrbkUY"}],"key":"T9Pe2TtE6H"},{"type":"text","value":".","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"E1oFMRLZrY"}],"key":"WivMjke8ZM"}],"key":"AxktF3VctA"},{"type":"listItem","spread":true,"position":{"start":{"line":85,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":85,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"e2hZlsHLj4"},{"type":"strong","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"sbg1VQzGFy"}],"key":"chp8C6Ktla"},{"type":"text","value":" that the agent can take. 
We use ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"qCYpePUXb2"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"A\\mathcal{A}A","key":"JcJstBzzrW"},{"type":"text","value":" to denote the\nset of possible actions, called the ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"HHPIraiDz8"},{"type":"strong","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"children":[{"type":"text","value":"action space","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"mgFNlzXqTV"}],"key":"W1v9OWo9nf"},{"type":"text","value":".","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"zErOVWL4tc"}],"key":"fbd7y3HXZ6"}],"key":"Tc06mnzUbc"},{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":89,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"Some ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"VvxeeTTFqU"},{"type":"strong","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"initial state distribution","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"sdXNLHi6UK"}],"key":"VQJLik6qS0"},{"type":"text","value":" ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"JPxV86UA5w"},{"type":"inlineMath","value":"\\mu \\in \\triangle(\\mathcal{S})","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"μ(S)\\mu \\in \\triangle(\\mathcal{S})μ(S)","key":"yO1AfWaKJp"},{"type":"text","value":".","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"kPDkT8Lz7w"}],"key":"kTZdltJwXn"}],"key":"DEfObGb8GV"},{"type":"listItem","spread":true,"position":{"start":{"line":90,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":90,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"i3H4Py2TWP"},{"type":"strong","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"k8vr8jzvig"}],"key":"XiMfRUQx37"},{"type":"text","value":" (a.k.a. 
","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"zmiYjcEmcG"},{"type":"strong","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"dynamics","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"scbNLJMDPF"}],"key":"hkd006fjc5"},{"type":"text","value":")\n","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"XKn6sNpLjR"},{"type":"inlineMath","value":"P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"html":"P:S×A(S)P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})P:S×A(S)","key":"gr0oDxvhMw"},{"type":"text","value":" that describe what state the agent\ntransitions to after taking an action.","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"KRIKg39wQC"}],"key":"FnUDLAIi9O"}],"key":"XKyFFHLUlO"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"LDJZNYZ2ds"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"reward","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"jWp0JYSDyd"}],"key":"nWrgRXhnwx"},{"type":"text","value":" signal. In this course we’ll take it to be a\ndeterministic function on state-action pairs,\n","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"uoZoI1qiKn"},{"type":"inlineMath","value":"r : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"r:S×ARr : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}r:S×AR","key":"imvw0GyiKU"},{"type":"text","value":", but in general many results will\nextend to a ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"VVVaPNO9Tp"},{"type":"emphasis","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"wX0vSZWc4s"}],"key":"dPTPIWBLAR"},{"type":"text","value":" reward signal.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"W5BzKKYUFN"}],"key":"yt44wkgX9y"}],"key":"YP8o5YxHbQ"},{"type":"listItem","spread":true,"position":{"start":{"line":99,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":99,"column":1},"end":{"line":100,"column":1}},"children":[{"type":"text","value":"A time horizon ","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"ypDUYv7UZe"},{"type":"inlineMath","value":"\\hor \\in \\mathbb{N}","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"html":"HN\\hor \\in \\mathbb{N}HN","key":"PK7tvj7B9H"},{"type":"text","value":" that specifies the number of\ninteractions in an 
","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"D4pLnJ9AAc"},{"type":"strong","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"children":[{"type":"text","value":"episode","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"ZPD8kB1wkl"}],"key":"nsgWhdqqpO"},{"type":"text","value":".","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"elStbc2tUC"}],"key":"eFFObekraK"}],"key":"HrMm00H9S5"}],"key":"iiJcQfXn6R"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"Combined together, these objects specify a finite-horizon Markov\ndecision process:","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"bq3mPqKJNb"}],"key":"w9xBWG5g5Q"},{"type":"math","value":"M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\hor).","position":{"start":{"line":105,"column":1},"end":{"line":105,"column":1}},"html":"M=(S,A,μ,P,r,H).M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\hor).M=(S,A,μ,P,r,H).","enumerator":"1.2","key":"eufAnvAsGi"},{"type":"paragraph","position":{"start":{"line":107,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"When there are ","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"wKpQnUXN8R"},{"type":"strong","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"finitely","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"UMYPBsOLuF"}],"key":"S3ctMG1LCd"},{"type":"text","value":" many states and actions, i.e.\n","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"HN28941HmQ"},{"type":"inlineMath","value":"|\\mathcal{S}|, |\\mathcal{A}| < \\infty","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"html":"S,A<|\\mathcal{S}|, |\\mathcal{A}| < \\inftyS,A<","key":"ls4DfkY9xL"},{"type":"text","value":", we can express\nthe relevant quantities as vectors and matrices (i.e. 
","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"ga5DVOS9bz"},{"type":"emphasis","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"tables","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"IdgppkL4Hn"}],"key":"TdXbCKAVG9"},{"type":"text","value":" of\nvalues):","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"UpN0MGG8Ph"}],"key":"mspoBT8WY5"},{"type":"math","value":"\\begin{aligned}\n \\mu &\\in [0, 1]^{|\\mathcal{S}|} &\n P &\\in [0, 1]^{(|\\mathcal{S} \\times \\mathcal{A}|) \\times |\\mathcal{S}|} &\n r &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}\n\\end{aligned}","position":{"start":{"line":112,"column":1},"end":{"line":118,"column":1}},"html":"μ[0,1]SP[0,1](S×A)×SrRS×A\\begin{aligned}\n \\mu &\\in [0, 1]^{|\\mathcal{S}|} &\n P &\\in [0, 1]^{(|\\mathcal{S} \\times \\mathcal{A}|) \\times |\\mathcal{S}|} &\n r &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}\n\\end{aligned}μ[0,1]SP[0,1](S×A)×SrRS×A","enumerator":"1.3","key":"OnUm2me1nu"}],"enumerator":"1.2","html_id":"finite-horizon-mdp","key":"J5zDySeaAU"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"pHTxTU5BaH"}],"key":"iVFUs67ULy"},{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"Verify that the types and shapes provided above make sense!","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"LfrtoTJ7hV"}],"key":"doT7KFPLM2"}],"key":"PXLjqMd2fX"}],"key":"mqI4kCAT5E"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class MDP(NamedTuple):\n \"\"\"A description of a Markov decision process with finitely many states and actions.\"\"\"\n S: int # number of states\n A: int # number of actions\n μ: Float[Array, \" S\"]\n P: Float[Array, \"S A S\"] # \"current\" state, \"current\" action, \"next\" state\n r: Float[Array, \"S A\"]\n H: int\n γ: float = 1.0 # discount factor (used later)","key":"jJno5x1oh1"},{"type":"output","id":"mzvnUaVALY7OepolD89HX","data":[],"key":"tIoCyVKWBK"}],"data":{},"key":"AJ5v8OJXNX"},{"type":"block","children":[{"type":"proof","kind":"example","label":"tidy_mdp","identifier":"tidy_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying MDP","position":{"start":{"line":137,"column":1},"end":{"line":137,"column":1}},"key":"yYz9OHtAF0"}],"key":"Dsl4OOz5RR"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"Let’s consider a simple decision problem throughout this chapter:\nthe task of keeping your room tidy!","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"l7JOeEJYMD"}],"key":"SpkVXbrvKp"},{"type":"paragraph","position":{"start":{"line":143,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"text","value":"Your room has the possible states\n","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"p8cX8EOL7H"},{"type":"inlineMath","value":"\\mathcal{S} = \\{ \\text{orderly}, \\text{messy} \\}.","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"html":"S={orderly,messy}.\\mathcal{S} = \\{ \\text{orderly}, \\text{messy} 
:::{prf:example} Tidying MDP
:label: tidy_mdp

Let's consider a simple decision problem throughout this chapter:
the task of keeping your room tidy!

Your room has the possible states
$\mathcal{S} = \{ \text{orderly}, \text{messy} \}$.
You can take either of the actions $\mathcal{A} = \{ \text{ignore}, \text{tidy} \}$.
The room starts off orderly.

The **state transitions** are as follows:
if you tidy the room, it becomes (or remains) orderly;
if you ignore the room, it *might* become messy (see table below).

The **rewards** are as follows: you get penalized for tidying an orderly room (a waste of time) or ignoring a messy room,
but you get rewarded for ignoring an orderly room (since you can enjoy your additional time).
Tidying a messy room is a chore that gives no reward.

These are summarized in the following table:

$$
\begin{array}{ccccc}
    s & a & P(\text{orderly} \mid s, a) & P(\text{messy} \mid s, a) & r(s, a) \\
    \text{orderly} & \text{ignore} & 0.7 & 0.3 & 1 \\
    \text{orderly} & \text{tidy} & 1 & 0 & -1 \\
    \text{messy} & \text{ignore} & 0 & 1 & -1 \\
    \text{messy} & \text{tidy} & 1 & 0 & 0 \\
\end{array} \tag{1.4}
$$

Consider a time horizon of $H = 7$ days (one interaction per day). Let
$t = 0$ correspond to Monday and $t = 6$ correspond to Sunday.
:::

```python
tidy_mdp = MDP(
    S=2,  # 0 = orderly, 1 = messy
    A=2,  # 0 = ignore, 1 = tidy
    μ=jnp.array([1.0, 0.0]),  # start in orderly state
    P=jnp.array([
        [
            [0.7, 0.3],  # orderly, ignore
            [1.0, 0.0],  # orderly, tidy
        ],
        [
            [0.0, 1.0],  # messy, ignore
            [1.0, 0.0],  # messy, tidy
        ],
    ]),
    r=jnp.array([
        [
            1.0,   # orderly, ignore
            -1.0,  # orderly, tidy
        ],
        [
            -1.0,  # messy, ignore
            0.0,   # messy, tidy
        ],
    ]),
    H=7,
)
```
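As a quick illustration (this snippet is ours, not from the original text), we can validate this instance with the hypothetical `check_mdp` helper from above and sample a single transition from its dynamics:

```python
import jax

check_mdp(tidy_mdp)  # the table above defines a valid MDP

key = jax.random.PRNGKey(184)
s, a = 0, 0  # state "orderly", action "ignore"
s_next = jax.random.choice(key, tidy_mdp.S, p=tidy_mdp.P[s, a])
print(s_next)  # 0 ("orderly") w.p. 0.7, 1 ("messy") w.p. 0.3
```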
","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"Qlp7ld3iq9"},{"type":"text","value":"π","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"hzaEwQLE0w"},{"type":"text","value":" describes the agent’s strategy:\nwhich actions it takes in a given situation.\nA key goal of RL is to find the ","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"gEwofpWFhJ"},{"type":"strong","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"IEhT152Bqd"}],"key":"tZzHqkMoio"},{"type":"text","value":" that maximizes the total reward on average.","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"bZZq2GlzKq"}],"key":"RE4DCskPke"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"There are three axes along which policies can vary: their outputs,\ninputs, and time-dependence.","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"asH4jNICZk"}],"key":"DCwgF1DrOn"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":211,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":211,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"strong","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"Deterministic or stochastic.","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"j0HAV2tWfT"}],"key":"AFhkSlvOpD"},{"type":"text","value":" A deterministic policy outputs\nactions while a stochastic policy outputs ","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"rmAycrPi52"},{"type":"emphasis","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"distributions","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"kBI4zuYMRL"}],"key":"MRCSWe9iW3"},{"type":"text","value":" over\nactions.","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"G66UJHvDjD"}],"key":"X4PSWNRI34"}],"key":"YEak12Jvzc"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.png","alt":"A deterministic policy.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"hoDZDSn3LR","urlSource":"./shared/deterministic_policy.png","urlOptimized":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":218,"column":1},"end":{"line":218,"column":1}},"children":[{"type":"text","value":"A deterministic policy.","position":{"start":{"line":218,"column":1},"end":{"line":218,"column":1}},"key":"pqsKmWv9O2"}],"key":"YqwV7EsC9S"}],"key":"o2JXAAZXXm"}],"enumerator":"1.1","key":"KtFfzNewmq"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/stochastic_policy-bc720a6ff54c4a27f3c7ec4de93b5c0d.png","alt":"A stochastic 
policy.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"uNHa1RCKTh","urlSource":"./shared/stochastic_policy.png","urlOptimized":"/build/stochastic_policy-bc720a6ff54c4a27f3c7ec4de93b5c0d.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"text","value":"A stochastic policy.","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"vklfn2Pru4"}],"key":"u9JvoX70oO"}],"key":"zejyDXYB48"}],"enumerator":"1.2","key":"pXIoE4bom5"},{"type":"list","ordered":true,"start":2,"spread":false,"position":{"start":{"line":227,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":227,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":231,"column":1}},"children":[{"type":"strong","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"children":[{"type":"text","value":"State-dependent or history-dependent.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"bRU6tjISMY"}],"key":"XuBJB2zeqM"},{"type":"text","value":" A state-dependent (a.k.a.\n“Markovian”) policy only depends on the current state, while a\nhistory-dependent policy depends on the sequence of past states,\nactions, and rewards. We’ll only consider state-dependent policies\nin this course.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"omqrv2m9FN"}],"key":"mzsOQMRphw"}],"key":"b228b6Ekw0"},{"type":"listItem","spread":true,"position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"strong","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"Stationary or time-dependent.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"y326mjb4ve"}],"key":"blCi32feZx"},{"type":"text","value":" A stationary (a.k.a. time-homogeneous) policy\nremains the same function at all time steps, while a time-dependent policy can depend on the current timestep.\nFor consistency with states and actions, we will denote the timestep as a subscript,\ni.e. 
","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"wYg1e5UeTT"},{"type":"inlineMath","value":"\\pi = \\{ \\pi_0, \\dots, \\pi_{\\hor-1} \\}.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"html":"π={π0,,πH1}.\\pi = \\{ \\pi_0, \\dots, \\pi_{\\hor-1} \\}.π={π0,,πH1}.","key":"miyjIE4xXe"}],"key":"j84VphrvwA"}],"key":"ZBmBlGgVNP"}],"key":"dCeFM1w1nz"}],"enumerator":"1.3","html_id":"policy","key":"p1rodBmB2K"}],"key":"vufHjOdmGA"},{"type":"block","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":241,"column":1},"end":{"line":244,"column":1}},"children":[{"type":"text","value":"Note that for finite state and action spaces,\nwe can represent a randomized mapping ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"kWnwzY8Z3V"},{"type":"inlineMath","value":"\\mathcal{S} \\to \\Delta(\\mathcal{A})","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"SΔ(A)\\mathcal{S} \\to \\Delta(\\mathcal{A})SΔ(A)","key":"w17yjY8sO0"},{"type":"text","value":"\nas a matrix ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"fbrYUrVsHq"},{"type":"inlineMath","value":"\\pi \\in [0, 1]^{\\mathcal{S} \\times \\mathcal{A}}","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"π[0,1]S×A\\pi \\in [0, 1]^{\\mathcal{S} \\times \\mathcal{A}}π[0,1]S×A","key":"mx2jd9lAzH"},{"type":"text","value":" where each row describes\nthe policy’s distribution over actions for the corresponding state.","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"bcHrkazAG8"}],"key":"EXORPvA53d"},{"type":"paragraph","position":{"start":{"line":246,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"A fascinating result is that every finite-horizon MDP has an optimal deterministic time-dependent policy!\nIntuitively, the Markov property implies that the current state contains all the information we need to make the optimal decision.\nWe’ll prove this result constructively later in the chapter.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"GSvTE2vRgd"}],"key":"urVYK7MWTd"},{"type":"proof","kind":"example","label":"tidy_policy","identifier":"tidy_policy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policies for the tidying MDP","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"zkd0FNjf9r"}],"key":"kUrHlbYNmc"},{"type":"paragraph","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"children":[{"type":"text","value":"Here are some possible policies for the tidying MDP ","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"W6wpjnuU2F"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_mdp","label":"tidy_mdp","children":[{"type":"text","value":"Example ","key":"FDR8O14Klm"},{"type":"text","value":"1.1","key":"bZrusmlBuE"}],"template":"Example 
%s","enumerator":"1.1","resolved":true,"html_id":"tidy-mdp","key":"k5dFqwDpO0"},{"type":"text","value":":","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"pOjsthzsed"}],"key":"IS0vZkesxE"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":255,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":255,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"children":[{"type":"text","value":"Always tidy: ","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"zds4L2wRPN"},{"type":"inlineMath","value":"\\pi(s) = \\text{tidy}","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"html":"π(s)=tidy\\pi(s) = \\text{tidy}π(s)=tidy","key":"Ja71AnHu2e"},{"type":"text","value":".","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"BUifrGRTqu"}],"key":"qZnkPIyxyO"}],"key":"fUH5MOb0aY"},{"type":"listItem","spread":true,"position":{"start":{"line":257,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":257,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"Only tidy on weekends: ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"IL1piWUzRu"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{tidy}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=tidy\\pi_\\hi(s) = \\text{tidy}πh(s)=tidy","key":"Egni4YDPoq"},{"type":"text","value":" if\n","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"qk1aBTL9JI"},{"type":"inlineMath","value":"\\hi \\in \\{ 5, 6 \\}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"h{5,6}\\hi \\in \\{ 5, 6 \\}h{5,6}","key":"lz2IZpwiNq"},{"type":"text","value":" and ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"vYO45vYPzu"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{ignore}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=ignore\\pi_\\hi(s) = \\text{ignore}πh(s)=ignore","key":"dWI38FdvVz"},{"type":"text","value":" otherwise.","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"V9OEAx4NSU"}],"key":"nekUnThMKv"}],"key":"ov8ScCwf3M"},{"type":"listItem","spread":true,"position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"text","value":"Only tidy if the room is messy: ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"levYisn6fk"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{messy}) = \\text{tidy}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(messy)=tidy\\pi_\\hi(\\text{messy}) = \\text{tidy}πh(messy)=tidy","key":"T5z7xPwVsl"},{"type":"text","value":"\nand ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"Y4sDc6SnCZ"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{orderly}) = \\text{ignore}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(orderly)=ignore\\pi_\\hi(\\text{orderly}) = \\text{ignore}πh(orderly)=ignore","key":"LzpbgAh4vZ"},{"type":"text","value":" for all 
","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"h7YExN5vgI"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"h\\hih","key":"QiBT3IV7gd"},{"type":"text","value":".","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"sE3tTqkH03"}],"key":"uU1ZEirMfk"}],"key":"siCy81Ztw2"}],"key":"nM5dwuU5rL"}],"enumerator":"1.2","html_id":"tidy-policy","key":"XKNzF8LnxE"}],"key":"Zv3F4CktAi"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# arrays of shape (H, S, A) represent time-dependent policies\ntidy_policy_always_tidy = (\n jnp.zeros((7, 2, 2))\n .at[:, :, 1].set(1.0)\n)\ntidy_policy_weekends = (\n jnp.zeros((7, 2, 2))\n .at[5:7, :, 1].set(1.0)\n .at[0:5, :, 0].set(1.0)\n)\ntidy_policy_messy_only = (\n jnp.zeros((7, 2, 2))\n .at[:, 1, 1].set(1.0)\n .at[:, 0, 0].set(1.0)\n)","key":"H7OMxk4c61"},{"type":"output","id":"ml0ab07MTrMwSZ-XaKG0V","data":[],"key":"N3owY2U2KV"}],"data":{},"key":"ciEe2l1kEZ"},{"type":"block","children":[{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"EXLAGNo4DT"}],"key":"lVhQFUqJ1F"},{"type":"paragraph","position":{"start":{"line":283,"column":1},"end":{"line":285,"column":1}},"children":[{"type":"text","value":"Array objects in Jax are ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"sfvU4xw4me"},{"type":"strong","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"immutable,","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"EAjeVohfmM"}],"key":"FMhmNAKN3m"},{"type":"text","value":" that is, they cannot be ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"skTPURrHBS"},{"type":"emphasis","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"changed.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"Jm7iRoAkm8"}],"key":"rPYy5anH1S"},{"type":"text","value":"\nThis might seem inconvenient, but in larger projects,\nimmutability makes code much easier to reason about.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"ReDOVqxzim"}],"key":"ZcqOkkhRgo"}],"key":"p1EM7TJt3t"}],"key":"tnPO4osUdc"},{"type":"block","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Trajectories","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"aQBSVgYRN9"}],"label":"trajectories","identifier":"trajectories","html_id":"trajectories","enumerator":"1.2.3","key":"LXwacCtMlf"},{"type":"proof","kind":"definition","label":"trajectory","identifier":"trajectory","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"KcYeY3Fuav"}],"key":"yxik7kpFkr"},{"type":"paragraph","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"A sequence of states, actions, and rewards is called a 
","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"G4fkO3iqv5"},{"type":"strong","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"YJdVAObrp1"}],"key":"CcPlMa2sEB"},{"type":"text","value":":","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"IltO9RABJi"}],"key":"nGr5uGGxg8"},{"type":"math","value":"\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"τ=(s0,a0,r0,,sH1,aH1,rH1)\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})τ=(s0,a0,r0,,sH1,aH1,rH1)","enumerator":"1.5","key":"DYmmTFVhf7"},{"type":"paragraph","position":{"start":{"line":300,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"raT35Ak11w"},{"type":"inlineMath","value":"r_\\hi = r(s_\\hi, a_\\hi)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"rh=r(sh,ah)r_\\hi = r(s_\\hi, a_\\hi)rh=r(sh,ah)","key":"gLHvuudhHX"},{"type":"text","value":".\n(Note that some sources omit the reward at the final time step. This is a minor detail.)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"a9JWh2RJ0V"}],"key":"Ug3bDh5MIt"}],"enumerator":"1.4","html_id":"trajectory","key":"IM5Yup7puL"}],"key":"O2L5H9mHBT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Transition(NamedTuple):\n \"\"\"A single state-action-reward interaction with the environment.\n\n A trajectory comprises a sequence of transitions.\n \"\"\"\n s: int\n a: int\n r: float","key":"a6LoRNEBnX"},{"type":"output","id":"2E7iizq9o92VpiPyHVLND","data":[],"key":"DIXTcL4cWZ"}],"data":{},"key":"LXKUzXUIPq"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":315,"column":1},"end":{"line":317,"column":1}},"children":[{"type":"text","value":"Once we’ve chosen a policy,\nwe can sample trajectories by repeatedly choosing actions according to the policy,\ntransitioning according to the state transitions, and observing the rewards.","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"ZykA1Ahipp"}],"key":"VUmVdj7hp4"},{"type":"image","url":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.png","width":"240px","align":"center","key":"EhVUmKm1Iz","urlSource":"shared/trajectory.png","urlOptimized":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.webp"},{"type":"paragraph","position":{"start":{"line":324,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"text","value":"That is, a policy induces a distribution ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"KiFBLtA534"},{"type":"inlineMath","value":"\\rho^{\\pi}","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"ρπ\\rho^{\\pi}ρπ","key":"PxUHmwBdqM"},{"type":"text","value":" over trajectories.\n(We assume that ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"jIjjwecRae"},{"type":"text","value":"μ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"hdsNdivGQR"},{"type":"text","value":" and 
","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"FMt3tnef2v"},{"type":"inlineMath","value":"P","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"PPP","key":"hZgb6vWhr5"},{"type":"text","value":" are clear from context.)","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"HwEt1W8jid"}],"key":"XcDPs7Dm6M"},{"type":"proof","kind":"example","label":"tidy_traj","identifier":"tidy_traj","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories in the tidying environment","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"GHJCRP3Sn7"}],"key":"hMlA3ayKZQ"},{"type":"paragraph","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"children":[{"type":"text","value":"Here is a possible trajectory for the tidying example:","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"key":"oIcrZ2j3Nq"}],"key":"ZlnaUkUbTz"},{"type":"container","kind":"table","children":[{"type":"table","position":{"start":{"line":333,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"h\\hih","key":"m9BQlrSLve"}],"key":"z2xyeMugdT"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"RgeJ9mnjKu"}],"key":"zm5d2sitia"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"BNCYAIk8Ap"}],"key":"Y35EgA691k"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"2","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"AZr5iSeLzT"}],"key":"ZzknCQF1zc"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"nTI36G8hbd"}],"key":"L3yjvBFbKw"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"4","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"fbxsMNtQWf"}],"key":"R1acl3loqQ"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"5","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"TNVszDr5Op"}],"key":"CG85mKyAS0"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"6","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"MYudHrbZxE"}],"key":"dNH0
cHH4va"}],"key":"mrGQ65rxX4"},{"type":"tableRow","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"sss","key":"u5zwfU4aSJ"}],"key":"CjxbV7La6B"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"umxT61MIKd"}],"key":"HHQwBrpZGQ"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"vpxUISFtXT"}],"key":"KqcmiVxZeM"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"XO9DgAInkq"}],"key":"cUSbRpXrVK"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"P5Jo9thIHB"}],"key":"lGFejvI7sT"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"sYclhZJ4Vm"}],"key":"PxffCI6l4y"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"C9r5qxwwd8"}],"key":"KPmhCIEJki"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"iiXiXsbpKs"}],"key":"i66mQOTEHL"}],"key":"aTGj5JSdDy"},{"type":"tableRow","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"html":"aaa","key":"QveOMrayyj"}],"key":"g5DcB4TdoG"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"CKIZt18XaO"}],"key":"MHnajdmrCD"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"P86FrjnX4a"}],"key":"Wds8UbyKFW"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"Y7cQxjZTfm"}],"key":"c3az0RtuyA"}
,{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"Skv3NVNQdf"}],"key":"PQNoNydA6N"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"tN7u9e6Nvr"}],"key":"UocXlZTRGz"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"agxZ1MD4T8"}],"key":"y21LdRV4lP"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"RQhv7SwP5Y"}],"key":"PuHTY4nOqU"}],"key":"WJvNi9y9r0"},{"type":"tableRow","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"html":"rrr","key":"dVglnkvTBS"}],"key":"RbUcodUnbt"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"ZsXojoLc8Y"}],"key":"HP0FiLQ7Qt"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"qLAiTLZvYj"}],"key":"WiBQerIvCU"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"yWJI0IuydC"}],"key":"WO5p8192Ui"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"RuPWXkGYji"}],"key":"iL97O644r5"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"UkezKlQ0zc"}],"key":"dMq6AWybt9"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"Mt5cdBK4bN"}],"key":"bGGTjhGNYX"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"alh7CCd81z"}],"key":"i7yGZsbUEu"}],"key":"lr6doU75q5"}],"key":"QBPBKmGi8I"}],"enumerator":"1.1","key":"R75yGROeMM"},{"type":"paragraph","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"childr
en":[{"type":"text","value":"Could any of the policies in ","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"oTxyH4d685"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"kV057X2bfL"},{"type":"text","value":"1.2","key":"xoW0KK1KXF"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"M9AKd4UACm"},{"type":"text","value":" have generated this trajectory?","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"lsiGAKpLed"}],"key":"uC64f8LZwh"}],"enumerator":"1.3","html_id":"tidy-traj","key":"YPw8WXYboD"},{"type":"paragraph","position":{"start":{"line":343,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"Note that for a state-dependent policy, using the Markov property ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"hRPLnTjHTj"},{"type":"crossReference","kind":"proof:definition","identifier":"markov","label":"markov","children":[{"type":"text","value":"Definition ","key":"PQHRlNdN01"},{"type":"text","value":"1.1","key":"f4spORheBi"}],"template":"Definition %s","enumerator":"1.1","resolved":true,"html_id":"markov","key":"v5v3uGqn2W"},{"type":"text","value":",\nwe can write down the likelihood function of this probability distribution in an ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"Suvcb3YFyE"},{"type":"strong","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"autoregressive","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"RI5yR6gALM"}],"key":"TN7M3E3I7H"},{"type":"text","value":" way (i.e. 
one timestep at a time):","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"bAGu1xIccJ"}],"key":"NvkQBZZIgz"},{"type":"proof","kind":"definition","label":"autoregressive_trajectories","identifier":"autoregressive_trajectories","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Autoregressive trajectory distribution","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"WrUnI8HMNc"}],"key":"QGAzd2QItS"},{"type":"math","value":"\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)","enumerator":"1.6","key":"h58CFI8W6T"}],"enumerator":"1.5","html_id":"autoregressive-trajectories","key":"yVuWR0xAdp"}],"key":"w6v6pfQC1C"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def trajectory_log_likelihood(\n mdp: MDP,\n τ: list[Transition],\n π: Float[Array, \"S A\"],\n) -> float:\n \"\"\"Compute the log-likelihood of a trajectory under a given MDP and policy.\"\"\"\n\n # initial distribution and action\n total = jnp.log(mdp.μ[τ[0].s])\n total += jnp.log(π[τ[0].s, τ[0].a])\n\n # remaining state transitions and actions\n for i in range(1, mdp.H):\n total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s])\n total += jnp.log(π[τ[i].s, τ[i].a])\n\n return total","key":"RenoEWvZuT"},{"type":"output","id":"dszYr90dG_2Ak092bkQxX","data":[],"key":"MX1tPhzEof"}],"data":{},"key":"vv5fhEW7EN"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"Rh2ZMvwhhx"}],"key":"Rmlc7PIi0D"},{"type":"paragraph","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"text","value":"How would you modify this to include stochastic rewards?","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"key":"FbpWwb35rD"}],"key":"pKYYya3MzV"}],"key":"uwjtvDjHzW"},{"type":"paragraph","position":{"start":{"line":376,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"For a deterministic policy ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"UKCwY1rJQI"},{"type":"text","value":"π","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"ECUOUpF1D2"},{"type":"text","value":", we have that ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"wgrepgvMjZ"},{"type":"inlineMath","value":"\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"πh(as)=I[a=πh(s)]\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]πh(as)=I[a=πh(s)]","key":"DxyRstynCn"},{"type":"text","value":";\nthat is, the probability of taking an action is 
","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"fnJbgUZBGM"},{"type":"text","value":"1","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"RafV3WM7mH"},{"type":"text","value":" if it’s the unique action prescribed by the policy for that state and ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"xNK7dMNSkU"},{"type":"text","value":"0","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"cvfBcfglio"},{"type":"text","value":" otherwise.\nIn this case, the only randomness in sampling trajectories comes from the initial state distribution ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"kZrtpqwFai"},{"type":"text","value":"μ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"R1n9hM9yGS"},{"type":"text","value":" and the state transitions ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"M7pVvXTHyk"},{"type":"inlineMath","value":"P","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"PPP","key":"w3UYbi06n5"},{"type":"text","value":".","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"czK1NgdZ9j"}],"key":"tB10JdaHpg"}],"key":"bksz4UzqDT"},{"type":"block","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"children":[{"type":"text","value":"Value functions","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"key":"Qq05CvH7k1"}],"identifier":"value-functions","label":"Value functions","html_id":"value-functions","implicit":true,"enumerator":"1.2.4","key":"thU9jIaJiR"},{"type":"paragraph","position":{"start":{"line":384,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"The main goal of RL is to find a policy that maximizes the expected total\nreward ","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"Je3ejjnlZS"},{"type":"inlineMath","value":"\\E [r_0 + \\cdots + r_{\\hor-1}]","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"html":"E[r0++rH1]\\E [r_0 + \\cdots + r_{\\hor-1}]E[r0++rH1]","key":"hzq1DmZr9l"},{"type":"text","value":".","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"IstSYQpSsZ"}],"key":"EE23LasPNL"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"wcxYTlmgZU"}],"key":"wpyEiYXMbq"},{"type":"paragraph","position":{"start":{"line":388,"column":1},"end":{"line":390,"column":1}},"children":[{"type":"text","value":"Note that ","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"h7l6iW4Fze"},{"type":"inlineMath","value":"r_0 + \\cdots + r_{\\hor-1}","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"html":"r0++rH1r_0 + \\cdots + r_{\\hor-1}r0++rH1","key":"HjTwkbbGhj"},{"type":"text","value":" is a random variable.\nWhat sources of randomness does it depend on?\nDescribe the generating process.","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"EnUg8Yurbo"}],"key":"GZpi7OucxQ"}],"key":"hSmUtQ3Egr"},{"type":"paragraph","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"children":[{"type":"text","value":"Let’s introduce some notation 
for analyzing this quantity.","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"key":"RjhWTMJpc6"}],"key":"Tws6dvIw7R"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"A policy’s ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"wCcxlyUQ5g"},{"type":"strong","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"d1CdanR105"}],"key":"G7ZegeDYqU"},{"type":"text","value":" at time ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"H5XevVf9ui"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"h\\hih","key":"MViz5Sbx2d"},{"type":"text","value":" is its expected remaining reward ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"RDDYcQ80Pt"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"from a given state","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"wgMDYPqra8"}],"key":"Nr28lqf09K"},{"type":"text","value":":","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"Ztls6fGsyD"}],"key":"P0ATYXXqds"},{"type":"proof","kind":"definition","label":"value","identifier":"value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Value function","position":{"start":{"line":397,"column":1},"end":{"line":397,"column":1}},"key":"VxmImDL5OK"}],"key":"WQXxiTgei6"},{"type":"math","value":"V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"Vhπ(s):=Eτρπ[rh++rH1sh=s]V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]Vhπ(s):=Eτρπ[rh++rH1sh=s]","enumerator":"1.7","key":"i3Tg9pEWbz"}],"enumerator":"1.6","html_id":"value","key":"ftgVyN07RI"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":404,"column":1}},"children":[{"type":"text","value":"Similarly, we can define the ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"aBN5WicUZO"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"action-value function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"yWcNhfZDS0"}],"key":"kJVFlesCvH"},{"type":"text","value":" (aka the\n","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"CDrZhSsEjf"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Q-function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"izqhN2tO72"}],"key":"ZPzcEslLxL"},{"type":"text","value":") at time ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Tb54TaZq1E"},{"type":"inlineMath","value":"h","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"html":"hhh","key":"arRN5rMpDS"},{"type":"text","value":" as the expected remaining reward 
","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Kxdms1XXoR"},{"type":"emphasis","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"from a given state and taking a given action","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"hXrZlegK2L"}],"key":"H9nmW1qGbb"},{"type":"text","value":":","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"ukyZPKAfjC"}],"key":"pZXyKaZo74"},{"type":"proof","kind":"definition","label":"action_value","identifier":"action_value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Action-value function","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"u65r0pIdZH"}],"key":"s4L8jAp5yx"},{"type":"math","value":"Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"html":"Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]","enumerator":"1.8","key":"sOyNZarsox"}],"enumerator":"1.7","html_id":"action-value","key":"Y6akPE87DV"}],"key":"rGqXBS5x0W"},{"type":"block","position":{"start":{"line":412,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"heading","depth":4,"position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"Relating the value function and action-value function","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"key":"flMXt6ItZ8"}],"identifier":"relating-the-value-function-and-action-value-function","label":"Relating the value function and action-value function","html_id":"relating-the-value-function-and-action-value-function","implicit":true,"enumerator":"1.2.4.1","key":"TYzhDzEUmu"},{"type":"paragraph","position":{"start":{"line":416,"column":1},"end":{"line":417,"column":1}},"children":[{"type":"text","value":"Note that the value function is just the expected action-value over\nactions drawn from the policy:","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"eoNOfER7o3"}],"key":"Vuf14ltKns"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]","position":{"start":{"line":419,"column":1},"end":{"line":419,"column":1}},"html":"Vhπ(s)=Eaπh(s)[Qhπ(s,a)]V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]Vhπ(s)=Eaπh(s)[Qhπ(s,a)]","enumerator":"1.9","key":"KTu2RGsDYB"}],"key":"rhUjhi64X2"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_v(\n policy: Float[Array, \"S A\"],\n q: Float[Array, \"S A\"],\n) -> Float[Array, \" S\"]:\n \"\"\"\n Compute the value function for a given policy in a known finite MDP\n at a single timestep from its action-value function.\n \"\"\"\n return jnp.average(q, weights=policy, axis=1)","key":"XIcz9NLBn0"},{"type":"output","id":"eDiBC3NeqfcTrHPvjw6Tb","data":[],"key":"NbrFPaOClF"}],"data":{},"key":"d4V6K8kuUT"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":433,"column":1},"end":{"line":434,"column":1}},"children":[{"type":"text","value":"and the action-value is the sum of the immediate reward and the expected value of the 
following\nstate:","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"TZBMc0stiW"}],"key":"w0yzx0UXvP"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]","enumerator":"1.10","key":"SJgXG5MiHV"}],"key":"Wpe2XthFhU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def v_to_q(\n mdp: MDP,\n v_next: Float[Array, \" S\"],\n) -> Float[Array, \"S A\"]:\n \"\"\"\n Compute the action-value function in a known finite MDP\n at a single timestep from the corresponding value function.\n \"\"\"\n # the discount factor is relevant later\n return mdp.r + mdp.γ * mdp.P @ v_next\n\n\n# convert a list of v functions to a list of q functions\nv_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0))","key":"VbcAjBac2s"},{"type":"output","id":"XB9p1De2paS08gkC0r2cT","data":[],"key":"bGo8MmxSfn"}],"data":{},"key":"Cf9LPJm2IW"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"children":[{"type":"text","value":"Greedy policies","position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"key":"oV3FsbXY05"}],"identifier":"greedy-policies","label":"Greedy policies","html_id":"greedy-policies","implicit":true,"enumerator":"1.2.4.2","key":"nAeiJ0xqPo"},{"type":"paragraph","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"For any given ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"s9pOqDGgpx"},{"type":"inlineMath","value":"Q \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QRS×AQ \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}QRS×A","key":"iImjRMQl33"},{"type":"text","value":", we can define the ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"OAsJ5CMG6c"},{"type":"strong","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"greedy policy","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"AlzqpAPQ7m"}],"key":"JzSikxQfZV"},{"type":"text","value":" ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"qv3o7jyPRz"},{"type":"inlineMath","value":"\\hat \\pi_Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"π^Q\\hat \\pi_Qπ^Q","key":"GDgiG2qrJq"},{"type":"text","value":" as the deterministic policy that selects the action with the highest ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"a3leS1v1dQ"},{"type":"inlineMath","value":"Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QQQ","key":"O9wRO5k55r"},{"type":"text","value":"-value at each state:","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"H0plZ6zkkB"}],"key":"YrrOsizF8M"},{"type":"math","value":"\\hat \\pi_Q(s) = \\arg\\max_{a} Q_{sa}","position":{"start":{"line":459,"column":1},"end":{"line":461,"column":1}},"html":"π^Q(s)=argmaxaQsa\\hat \\pi_Q(s) = \\arg\\max_{a} 
Q_{sa}π^Q(s)=argamaxQsa","enumerator":"1.11","key":"tVUZTqMa0Y"}],"key":"wEUESvhtLD"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_greedy(q: Float[Array, \"S A\"]) -> Float[Array, \"S A\"]:\n \"\"\"\n Get the (deterministic) greedy policy with respect to an action-value function.\n Return the policy as a matrix of shape (S, A) where each row is a one-hot vector.\n \"\"\"\n A = q.shape[1]\n a_ary = jnp.argmax(q, axis=1)\n return jnp.eye(A)[a_ary]\n\n\ndef v_to_greedy(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \"S A\"]:\n \"\"\"Get the (deterministic) greedy policy with respect to a value function.\"\"\"\n return q_to_greedy(v_to_q(mdp, v))","key":"KPZxTFtuPW"},{"type":"output","id":"usD5cW7_ONIlp9iWX1r0f","data":[],"key":"qwA3uybbWJ"}],"data":{},"key":"sD7kEKBlSY"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"children":[{"type":"text","value":"The one-step (Bellman) consistency equation","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"ABj2y23OR4"}],"identifier":"the-one-step-bellman-consistency-equation","label":"The one-step (Bellman) consistency equation","html_id":"the-one-step-bellman-consistency-equation","implicit":true,"enumerator":"1.2.5","key":"MrN9es6E4J"},{"type":"paragraph","position":{"start":{"line":481,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that by simply considering the cumulative reward as the sum of the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"yhaLxgTKDb"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"tpTzz8Rlqj"}],"key":"z9oBV2euMN"},{"type":"text","value":" reward and the ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"rKPcjXat0K"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"future","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"tLpVflus7C"}],"key":"M8mHrpQMA9"},{"type":"text","value":" cumulative reward, we can describe the\nvalue function recursively (in terms of itself). 
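As a quick (hypothetical) usage check of `q_to_greedy`: each row of the returned policy should be a one-hot distribution that puts all mass on the argmax action. The Q-table sizes below are made up for illustration.

```python
key = jax.random.PRNGKey(0)
q_example = jax.random.normal(key, (3, 2))  # made-up Q-values for 3 states, 2 actions
greedy = q_to_greedy(q_example)
assert jnp.allclose(greedy.sum(axis=1), 1.0)  # each row is a valid (one-hot) distribution
assert jnp.all(greedy.argmax(axis=1) == q_example.argmax(axis=1))  # mass sits on the best action
```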
This is named the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"jBi0Yr3Q37"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Bellman consistency equation","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"yQ0PSm8rDj"}],"key":"aKNQi9icp9"},{"type":"text","value":" after ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"n1OHdwX6zR"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Richard Bellman","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"pgw8vmwbiu"}],"key":"kpYuXDnD7v"},{"type":"text","value":" (1920--1984),\nwho is credited with introducing dynamic programming in 1953.","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"L5JbwG8c1L"}],"key":"ENXhctd9CG"},{"type":"proof","kind":"theorem","label":"bellman_consistency","identifier":"bellman_consistency","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for the value function","position":{"start":{"line":487,"column":1},"end":{"line":487,"column":1}},"key":"ZgYa1q5L4P"}],"key":"cvHQNlltbP"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":490,"column":1},"end":{"line":492,"column":1}},"html":"Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]","enumerator":"1.12","key":"Yyy6nFU7qX"}],"enumerator":"1.1","html_id":"bellman-consistency","key":"TXjUwbBrJN"}],"key":"dICYnMAHRm"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def check_bellman_consistency_v(\n mdp: MDP,\n policy: Float[Array, \"H S A\"],\n v_ary: Float[Array, \"H S\"],\n) -> bool:\n \"\"\"\n Check that the given (time-dependent) \"value function\"\n satisfies the Bellman consistency equation.\n \"\"\"\n return all(\n jnp.allclose(\n # lhs\n v_ary[h],\n # rhs\n jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1),\n )\n for h in range(mdp.H - 1)\n )","key":"DIodALJ0BY"},{"type":"output","id":"JxGaMz-Db2PYuQCCGu7Kd","data":[],"key":"vI8Y3JQnhB"}],"data":{},"key":"ZLNzec954j"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"AalDu5zbE8"}],"key":"UFeyhXV6zh"},{"type":"paragraph","position":{"start":{"line":517,"column":1},"end":{"line":518,"column":1}},"children":[{"type":"text","value":"Verify that this equation holds by expanding ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"USwMRJpDxz"},{"type":"inlineMath","value":"V_\\hi^\\pi(s)","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vhπ(s)V_\\hi^\\pi(s)Vhπ(s)","key":"Z1qQq8h0nc"},{"type":"text","value":"\nand 
","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"QAAN7f4yme"},{"type":"inlineMath","value":"V_{\\hi+1}^\\pi(s')","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vh+1π(s)V_{\\hi+1}^\\pi(s')Vh+1π(s)","key":"zv0c9iyKtp"},{"type":"text","value":".","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"pniLx3FyO9"}],"key":"kQP08fAUmI"}],"key":"gRyVxNsxO4"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":522,"column":1}},"children":[{"type":"text","value":"One can analogously derive the Bellman consistency equation for the\naction-value function:","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"Z9lgte9bgb"}],"key":"sJpHsZ7vks"},{"type":"proof","kind":"theorem","label":"bellman_consistency_action","identifier":"bellman_consistency_action","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for action-values","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"fGoz3ZZ6qL"}],"key":"ixR2Mrl5bv"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]","enumerator":"1.13","key":"Nqfs2ZmNiS"}],"enumerator":"1.2","html_id":"bellman-consistency-action","key":"lgE4dUlYnY"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"VpdJjj902h"}],"key":"Xmdh6OD9DG"},{"type":"paragraph","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"children":[{"type":"text","value":"Write a ","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"btGeoGpNKI"},{"type":"inlineCode","value":"check_bellman_consistency_q","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"BaVlX6snbg"},{"type":"text","value":" function for the action-value function.","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"hy1gMduxhM"}],"key":"M0PV2TFC5C"}],"key":"WfdrPyzNoS"},{"type":"proof","kind":"remark","label":"bellman_det","identifier":"bellman_det","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman consistency equation for deterministic policies","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"ieBxUnp5zo"}],"key":"n4OP8geGuK"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Note that for deterministic policies, the Bellman consistency equation\nsimplifies to","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"LAXggzvSqg"}],"key":"trorOGS4yt"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', 
\\pi_{\\hi+1}(s'))]\n\\end{aligned}","position":{"start":{"line":540,"column":1},"end":{"line":545,"column":1}},"html":"Vhπ(s)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]Qhπ(s,a)=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', \\pi_{\\hi+1}(s'))]\n\\end{aligned}Vhπ(s)Qhπ(s,a)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]","enumerator":"1.14","key":"rTYdbJe6mo"}],"enumerator":"1.1","html_id":"bellman-det","key":"P157kF0o8P"}],"key":"H7KjBEJgWk"},{"type":"block","position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"The one-step Bellman operator","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"lnxEXW1dDv"}],"identifier":"the-one-step-bellman-operator","label":"The one-step Bellman operator","html_id":"the-one-step-bellman-operator","implicit":true,"enumerator":"1.2.6","key":"FtwrR1WaVB"},{"type":"paragraph","position":{"start":{"line":552,"column":1},"end":{"line":554,"column":1}},"children":[{"type":"text","value":"Fix a policy ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"wQsu3o80PE"},{"type":"text","value":"π","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"YYma6jXSYy"},{"type":"text","value":". Consider the higher-order operator that takes in a\n“value function” ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"sAu6ilWDi1"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"jlNro583DD"},{"type":"text","value":" and returns the r.h.s. 
of the Bellman\nequation for that “value function”:","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"NeIJ0tEkRL"}],"key":"xbMWIjGlPv"},{"type":"proof","kind":"definition","label":"bellman_operator","identifier":"bellman_operator","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":556,"column":1},"end":{"line":556,"column":1}},"key":"dECMhSaiUn"}],"key":"iRTJob5o3k"},{"type":"math","value":"[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].","position":{"start":{"line":559,"column":1},"end":{"line":559,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].","enumerator":"1.15","key":"n9YYQLr8BT"},{"type":"paragraph","position":{"start":{"line":561,"column":1},"end":{"line":564,"column":1}},"children":[{"type":"text","value":"This is a crucial tool for reasoning about MDPs.\nIntuitively, it answers the following question:\nif we evaluate the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"QX3MozcBnQ"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"next","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"Sqz4k53fau"}],"key":"hEtp7u2a7v"},{"type":"text","value":" state using ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"ZZ9k14OuXC"},{"type":"inlineMath","value":"v","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"html":"vvv","key":"WAG4L4K0jH"},{"type":"text","value":",\nhow good is the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"xUF2SVqMiL"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"TW2eXnQKo1"}],"key":"yosI0S4j5q"},{"type":"text","value":" state, according to the given policy?","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"bj5fl2VDe0"}],"key":"PNwHBM01JL"}],"enumerator":"1.8","html_id":"bellman-operator","key":"xOHcAXLtKd"}],"key":"ZOW9azrVdd"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator_looping(\n    mdp: MDP,\n    policy: Float[Array, \"S A\"],\n    v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n    \"\"\"\n    Looping definition of the Bellman operator.\n    The concise version is below.\n    \"\"\"\n    v_new = jnp.zeros(mdp.S)\n    for s in range(mdp.S):\n        for a in range(mdp.A):\n            for s_next in range(mdp.S):\n                # JAX arrays are immutable, so we accumulate with\n                # .at[...].add(...) rather than in-place assignment\n                v_new = v_new.at[s].add(\n                    policy[s, a]\n                    * mdp.P[s, a, s_next]\n                    * (mdp.r[s, a] + mdp.γ * v[s_next])\n                )\n    return v_new","visibility":"hide","key":"WCnjENVeNM"},{"type":"output","id":"dyRksKX-inE8Nzasn_pUw","data":[],"visibility":"show","key":"x9d3Gpe1Yi"}],"data":{"tags":[]},"visibility":"show","key":"akPV4sOkGm"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Note that we can concisely implement this using the 
","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"E6AZMdFo08"},{"type":"inlineCode","value":"q_to_v","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"HvA8qa4yqD"},{"type":"text","value":" and ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"WGzCcRkyVN"},{"type":"inlineCode","value":"v_to_q","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"L8fkK961Zq"},{"type":"text","value":" utilities from above:","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"aAl4mK2GpG"}],"key":"xG4nM1uPt6"}],"key":"KwUvOpU3EU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator(\n mdp: MDP,\n policy: Float[Array, \"S A\"],\n v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n \"\"\"For a known finite MDP, the Bellman operator can be exactly evaluated.\"\"\"\n return q_to_v(policy, v_to_q(mdp, v)) # equivalent\n return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1)","key":"CZjETtn1ha"},{"type":"output","id":"GtGBn56rqDA_cYubrW3Ss","data":[],"key":"Vmf9aCa0xm"}],"data":{},"key":"LAeZQFBXu9"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":604,"column":1},"end":{"line":608,"column":1}},"children":[{"type":"text","value":"We’ll call ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"erG89WgjLw"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"Jπ:RSRS\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}Jπ:RSRS","key":"SHQIOIAbr1"},{"type":"text","value":" the ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"s263DmPXXo"},{"type":"strong","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"children":[{"type":"text","value":"Bellman\noperator","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"BaUezrmgzZ"}],"key":"pzOuxfdN3I"},{"type":"text","value":" of ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"P4fLaBvMGv"},{"type":"text","value":"π","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"Y940t9CBp7"},{"type":"text","value":".\nNote that it’s defined on any “value function” mapping states to real numbers;\n","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"LbFDb8qFg0"},{"type":"inlineMath","value":"v","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"vvv","key":"j1RFnHI1QC"},{"type":"text","value":" doesn’t have to be a well-defined value function for some policy (hence the lowercase notation).\nThe Bellman operator also gives us a concise way to express ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"ZEnta5L5ow"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"dyOCYKJFL1"},{"type":"text","value":"1.1","key":"N6iQdmCuXO"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"mkqAz0x2TA"},{"type":"text","value":" for the value 
function:","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"xhx5iLDBWa"}],"key":"MpCWvPWg8r"},{"type":"math","value":"V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)","position":{"start":{"line":610,"column":1},"end":{"line":610,"column":1}},"html":"Vhπ=Jπ(Vh+1π)V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)Vhπ=Jπ(Vh+1π)","enumerator":"1.16","key":"PP99BEzmpl"},{"type":"paragraph","position":{"start":{"line":612,"column":1},"end":{"line":615,"column":1}},"children":[{"type":"text","value":"Intuitively, the output of the Bellman operator, a new “value function”,\nevaluates states as follows: from a given state, take one action\naccording to ","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"GElSQrkjfA"},{"type":"text","value":"π","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"M99Vl3mFsM"},{"type":"text","value":", observe the reward, and then evaluate the next state\nusing the input “value function”.","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"B9b9aYF0bt"}],"key":"Ai7X0186hJ"},{"type":"paragraph","position":{"start":{"line":617,"column":1},"end":{"line":619,"column":1}},"children":[{"type":"text","value":"When we discuss infinite-horizon MDPs, the Bellman operator will turn\nout to be more than just a notational convenience: We’ll use it to\nconstruct algorithms for computing the optimal policy.","position":{"start":{"line":617,"column":1},"end":{"line":617,"column":1}},"key":"vrIgueMOZW"}],"key":"ZK6x6XJ8aq"},{"type":"heading","depth":2,"position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"Solving finite-horizon MDPs","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"jLC8DRq8bZ"}],"label":"finite_horizon_mdps","identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","enumerator":"1.3","key":"r063IXIFqm"},{"type":"heading","depth":3,"position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"children":[{"type":"text","value":"Policy evaluation in finite-horizon MDPs","position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"key":"xbEraMTwLK"}],"label":"eval_dp","identifier":"eval_dp","html_id":"eval-dp","enumerator":"1.3.1","key":"mKfMTovyDT"},{"type":"paragraph","position":{"start":{"line":628,"column":1},"end":{"line":629,"column":1}},"children":[{"type":"text","value":"How can we actually compute the value function of a given policy? 
This\nis the task of ","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"g75XDMKzqy"},{"type":"strong","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"EY5vZ7bzG6"}],"key":"LHC00FdA3A"},{"type":"text","value":".","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"vFOdYXNtoa"}],"key":"zVujq13ix2"},{"type":"proof","kind":"definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"DP algorithm to evaluate a policy in a finite-horizon MDP","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"key":"k29lvYXu31"}],"key":"ss6Edat9fz"},{"type":"paragraph","position":{"start":{"line":633,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation\n","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"tQ1DwWL04s"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"OQudG9a4lr"},{"type":"text","value":"1.1","key":"tkGJ74urSP"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"r71XwdwxAy"},{"type":"text","value":"\ngives us a convenient algorithm for\nevaluating stationary policies: it expresses the value function at\ntimestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"z0NZvSUjrR"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h\\hih","key":"ToCHR4ZNot"},{"type":"text","value":" as a function of the value function at timestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"BB2RxIhPrB"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h+1\\hi+1h+1","key":"eBNTPd7G0n"},{"type":"text","value":". 
This\nmeans we can start at the end of the time horizon, where the value is\nknown, and work backwards in time, using the Bellman consistency\nequation to compute the value function at each time step.","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"t9ohnr16Di"}],"key":"iOi109NtYY"}],"enumerator":"1.9","key":"f7qxJwpYbn"}],"key":"GACO3nzuiN"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def dp_eval_finite(mdp: MDP, policy: Float[Array, \"S A\"]) -> Float[Array, \"H S\"]:\n    \"\"\"Evaluate a policy using dynamic programming.\"\"\"\n    V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)]  # initialize to 0 at end of time horizon\n    for h in range(mdp.H - 1, -1, -1):\n        V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1])\n    return jnp.stack(V_ary[:-1])","key":"axBUcT44ur"},{"type":"output","id":"m2KQvip3tffMMmN6xvU6R","data":[],"key":"osKhzHDQvQ"}],"data":{},"key":"CVTzpiJ0Rt"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":652,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"This runs in time ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"EDH5gS7rPm"},{"type":"inlineMath","value":"O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"O(HS2A)O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)O(HS2A)","key":"ewo9lwL48J"},{"type":"text","value":" by counting the\nloops.","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"abujKKNucA"}],"key":"Z1EmBHsoM9"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"ElonKmFFCB"}],"key":"ILQSYh9RPC"},{"type":"paragraph","position":{"start":{"line":656,"column":1},"end":{"line":657,"column":1}},"children":[{"type":"text","value":"Do you see where we compute ","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"HBJnl5lniw"},{"type":"inlineMath","value":"Q^\\pi_\\hi","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"html":"QhπQ^\\pi_\\hiQhπ","key":"nV0ZKUqAPb"},{"type":"text","value":" along the way? Make\nthis step explicit.","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"LmLQusuBAy"}],"key":"uNe1ExCh8E"}],"key":"mGSwl8CWRI"},{"type":"proof","kind":"example","label":"tidy_eval_finite","identifier":"tidy_eval_finite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":660,"column":1},"end":{"line":660,"column":1}},"key":"VSNk2yFHFW"}],"key":"zCXeAEEieR"},{"type":"paragraph","position":{"start":{"line":663,"column":1},"end":{"line":667,"column":1}},"children":[{"type":"text","value":"Let’s evaluate the policy from\n","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"SPluul28HE"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"ROsdTl224p"},{"type":"text","value":"1.2","key":"oFnK9rwVUY"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"HdzO6cDrYr"},{"type":"text","value":" in the tidying MDP\nthat tidies if and only if the room is\nmessy. 
We’ll use the Bellman consistency equation to compute the value\nfunction at each time step.","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"VwcCf1vKbB"}],"key":"LnlSVNQ3sa"},{"type":"math","value":"\\begin{aligned}\nV_{H-1}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) \\\\\n&= 1 \\\\\nV_{H-1}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) \\\\\n&= 0 \\\\\nV_{H-2}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-1}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1 + 0.3 \\cdot 0 \\\\\n&= 1.7 \\\\\nV_{H-2}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-1}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 \\\\\nV_{H-3}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-2}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1.7 + 0.3 \\cdot 1 \\\\\n&= 2.49 \\\\\nV_{H-3}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-2}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1.7\n\\end{aligned}","position":{"start":{"line":669,"column":1},"end":{"line":690,"column":1}},"html":"VH1π(orderly)=r(orderly,ignore)=1VH1π(messy)=r(messy,tidy)=0VH2π(orderly)=r(orderly,ignore)+EsP(orderly,ignore)[VH1π(s)]=1+0.7VH1π(orderly)+0.3VH1π(messy)=1+0.71+0.30=1.7VH2π(messy)=r(messy,tidy)+EsP(messy,tidy)[VH1π(s)]=0+1VH1π(orderly)+0VH1π(messy)=1VH3π(orderly)=r(orderly,ignore)+EsP(orderly,ignore)[VH2π(s)]=1+0.7VH2π(orderly)+0.3VH2π(messy)=1+0.71.7+0.31=2.49VH3π(messy)=r(messy,tidy)+EsP(messy,tidy)[VH2π(s)]=0+1VH2π(orderly)+0VH2π(messy)=1.7\\begin{aligned}\nV_{H-1}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) \\\\\n&= 1 \\\\\nV_{H-1}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) \\\\\n&= 0 \\\\\nV_{H-2}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-1}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1 + 0.3 \\cdot 0 \\\\\n&= 1.7 \\\\\nV_{H-2}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-1}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 \\\\\nV_{H-3}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-2}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1.7 + 0.3 \\cdot 1 \\\\\n&= 2.49 \\\\\nV_{H-3}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-2}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 
1.7\n\\end{aligned}VH1π(orderly)VH1π(messy)VH2π(orderly)VH2π(messy)VH3π(orderly)VH3π(messy)=r(orderly,ignore)=1=r(messy,tidy)=0=r(orderly,ignore)+EsP(orderly,ignore)[VH1π(s)]=1+0.7VH1π(orderly)+0.3VH1π(messy)=1+0.71+0.30=1.7=r(messy,tidy)+EsP(messy,tidy)[VH1π(s)]=0+1VH1π(orderly)+0VH1π(messy)=1=r(orderly,ignore)+EsP(orderly,ignore)[VH2π(s)]=1+0.7VH2π(orderly)+0.3VH2π(messy)=1+0.71.7+0.31=2.49=r(messy,tidy)+EsP(messy,tidy)[VH2π(s)]=0+1VH2π(orderly)+0VH2π(messy)=1.7","enumerator":"1.17","key":"tLoXlTBMjR"},{"type":"paragraph","position":{"start":{"line":692,"column":1},"end":{"line":693,"column":1}},"children":[{"type":"text","value":"etc. You may wish to repeat this computation for the\nother policies to get a better sense of this algorithm.","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"rR3XMWPf97"}],"key":"QoWxCeM8QJ"}],"enumerator":"1.4","html_id":"tidy-eval-finite","key":"ZbCpBIGOlL"}],"key":"L7DWvQ4byX"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"V_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only)\nV_messy","key":"BrVc9RnBd2"},{"type":"output","id":"kI3PLAXow4GA4KExpalHJ","data":[{"output_type":"execute_result","execution_count":14,"metadata":{},"data":{"text/plain":{"content":"Array([[5.5621696, 4.7927704],\n [4.7927704, 4.0241003],\n [4.0241003, 3.253 ],\n [3.253 , 2.49 ],\n [2.49 , 1.7 ],\n [1.7 , 1. ],\n [1. , 0. ]], dtype=float32)","content_type":"text/plain"}}}],"key":"tnwKO6BoAH"}],"data":{},"key":"CHBdElOjSz"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"children":[{"type":"text","value":"Optimal policies in finite-horizon MDPs","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"dAg9Udr3UV"}],"label":"opt_dynamic_programming","identifier":"opt_dynamic_programming","html_id":"opt-dynamic-programming","enumerator":"1.3.2","key":"w5ISHyaMhv"},{"type":"paragraph","position":{"start":{"line":704,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"We’ve just seen how to ","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"Nnaz1KEdxD"},{"type":"emphasis","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"kpgtv4pOgz"}],"key":"KZ9xULEe4t"},{"type":"text","value":" a given policy. 
But how can we find\nthe ","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"vwG5G3cfWB"},{"type":"strong","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"uDVCtWqIUJ"}],"key":"hNrXpKtUAi"},{"type":"text","value":" for a given environment?","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"td9KsWC7JZ"}],"key":"j4MeJfa4hE"},{"type":"proof","kind":"definition","label":"optimal_policy_finite","identifier":"optimal_policy_finite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal policies","position":{"start":{"line":707,"column":1},"end":{"line":707,"column":1}},"key":"tNLU58Inzo"}],"key":"LCYQof8xAY"},{"type":"paragraph","position":{"start":{"line":710,"column":1},"end":{"line":712,"column":1}},"children":[{"type":"text","value":"We call a policy optimal, and denote it by ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"FIytnfkIi1"},{"type":"inlineMath","value":"\\pi^\\star","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"html":"π\\pi^\\starπ","key":"rj0z0YgaZg"},{"type":"text","value":", if it does at\nleast as well as ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"UuGIl6nhAu"},{"type":"emphasis","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"szxbLpXEA0"}],"key":"XvVfabqhNQ"},{"type":"text","value":" other policy ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"KOH8gcufYy"},{"type":"text","value":"π","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"gkzCNsbUhu"},{"type":"text","value":" (including stochastic and\nhistory-dependent ones) in all situations:","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"GGJLnQVjZb"}],"key":"oG1Ry9dCTI"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\cdots + r_{H-1} \\mid \\tau_\\hi] \\quad \\forall \\pi, \\tau_\\hi, \\hi \\in [H]\n\\end{aligned}","position":{"start":{"line":714,"column":1},"end":{"line":719,"column":1}},"html":"Vhπ(s)=Eτρπ[rh++rH1sh=s]Eτρπ[rh++rH1τh]π,τh,h[H]\\begin{aligned}\n V_\\hi^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\cdots + r_{H-1} \\mid \\tau_\\hi] \\quad \\forall \\pi, \\tau_\\hi, \\hi \\in [H]\n\\end{aligned}Vhπ(s)=Eτρπ[rh++rH1sh=s]Eτρπ[rh++rH1τh]π,τh,h[H]","enumerator":"1.18","key":"pbRAQsj3uN"},{"type":"paragraph","position":{"start":{"line":721,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"where we condition on the\ntrajectory up to time ","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"n9Ou97KIzA"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"h\\hih","key":"WnynMGDGmD"},{"type":"text","value":", 
denoted\n","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"B6oY0dAOd1"},{"type":"inlineMath","value":"\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"τh=(s0,a0,r0,,sh)\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)τh=(s0,a0,r0,,sh)","key":"UBTDuwjefZ"},{"type":"text","value":", where ","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"EAqJo52mVZ"},{"type":"inlineMath","value":"s_\\hi = s","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"sh=ss_\\hi = ssh=s","key":"Xw4SzejCdm"},{"type":"text","value":".","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"C29csi7vbX"}],"key":"Sc75OoToWu"}],"enumerator":"1.10","html_id":"optimal-policy-finite","key":"wcRF1F6vP0"},{"type":"paragraph","position":{"start":{"line":726,"column":1},"end":{"line":729,"column":1}},"children":[{"type":"text","value":"Convince yourself that all optimal policies must have the same value\nfunction. We call this the ","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"aMRvsbkvz7"},{"type":"strong","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"KHmKOwJziy"}],"key":"csjZ3ZGwlm"},{"type":"text","value":" and denote it by\n","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"FpVpRAeXmS"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"cowglaVijm"},{"type":"text","value":". 
The same goes for the action-value function\n","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"NRP5kNmlYe"},{"type":"inlineMath","value":"Q_\\hi^\\star(s, a)","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"html":"Qh(s,a)Q_\\hi^\\star(s, a)Qh(s,a)","key":"oGL27m9o47"},{"type":"text","value":".","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"newB15MeqQ"}],"key":"yjbcIxRPKE"},{"type":"paragraph","position":{"start":{"line":731,"column":1},"end":{"line":734,"column":1}},"children":[{"type":"text","value":"It is a stunning fact that ","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"mK11eB5Oh4"},{"type":"strong","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"children":[{"type":"text","value":"every finite-horizon MDP has an optimal\npolicy that is time-dependent and deterministic.","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"jBPzm9OGWz"}],"key":"PffCaoQ8e9"},{"type":"text","value":" In particular, we can\nconstruct such a policy by acting ","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"YHVE4WFDJ8"},{"type":"emphasis","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"children":[{"type":"text","value":"greedily","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"r83bYLYiuQ"}],"key":"Ms167YLTUH"},{"type":"text","value":" with respect to the optimal\naction-value function:","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"hr0mSXULSx"}],"key":"jBoniYihHh"},{"type":"proof","kind":"theorem","label":"optimal_greedy","identifier":"optimal_greedy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"It is optimal to be greedy with respect to the optimal value function","position":{"start":{"line":737,"column":1},"end":{"line":737,"column":1}},"key":"CnpSWLOoan"}],"key":"O7BePHm94q"},{"type":"math","value":"\\pi_\\hi^\\star(s) = \\arg\\max_a Q_\\hi^\\star(s, a).","position":{"start":{"line":740,"column":1},"end":{"line":740,"column":1}},"html":"πh(s)=argmaxaQh(s,a).\\pi_\\hi^\\star(s) = \\arg\\max_a Q_\\hi^\\star(s, a).πh(s)=argamaxQh(s,a).","enumerator":"1.19","key":"cdsQmiTxI8"}],"enumerator":"1.3","html_id":"optimal-greedy","key":"b7plR2mkMA"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"m8ytANnGeh"}],"key":"YTexVYkXph"},{"type":"paragraph","position":{"start":{"line":744,"column":1},"end":{"line":745,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"CDCOxwjUvB"},{"type":"inlineMath","value":"V^{\\star}","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"html":"VV^{\\star}V","key":"U48cvspJ4t"},{"type":"text","value":" and ","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"Gn0xYLcH2t"},{"type":"inlineMath","value":"Q^{\\star}","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"html":"QQ^{\\star}Q","key":"Vma8QVIPfg"},{"type":"text","value":" denote the optimal value and\naction-value functions. 
Consider the greedy policy","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"UWXilmVmTw"}],"key":"SN2PWV2M03"},{"type":"math","value":"\\hat \\pi_\\hi(s) := \\arg\\max_a Q_\\hi^{\\star}(s, a).","position":{"start":{"line":747,"column":1},"end":{"line":747,"column":1}},"html":"π^h(s):=argmaxaQh(s,a).\\hat \\pi_\\hi(s) := \\arg\\max_a Q_\\hi^{\\star}(s, a).π^h(s):=argamaxQh(s,a).","enumerator":"1.20","key":"tnRCn4IT6b"},{"type":"paragraph","position":{"start":{"line":749,"column":1},"end":{"line":750,"column":1}},"children":[{"type":"text","value":"We aim to show that\n","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"H7vbtNE6Y0"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"html":"π^\\hat \\piπ^","key":"ai7HV5ONoN"},{"type":"text","value":" is optimal; that is, ","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"SVLXTDGMd9"},{"type":"inlineMath","value":"V^{\\hat \\pi} = V^{\\star}","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"html":"Vπ^=VV^{\\hat \\pi} = V^{\\star}Vπ^=V","key":"xVP6w8k0pe"},{"type":"text","value":".","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"ZNxjaHVa67"}],"key":"pH6V0rv2S4"},{"type":"paragraph","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"children":[{"type":"text","value":"Fix an arbitrary state ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"lfgvFF8qJo"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"pGV1ZonF4B"},{"type":"text","value":" and time ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"NQsrzgzeNv"},{"type":"inlineMath","value":"\\hi \\in [H]","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"h[H]\\hi \\in [H]h[H]","key":"GwM4A1GQ6K"},{"type":"text","value":".","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"jklhJyeStZ"}],"key":"eosttjmiie"},{"type":"paragraph","position":{"start":{"line":754,"column":1},"end":{"line":759,"column":1}},"children":[{"type":"text","value":"Firstly, by the definition of ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"ZM5OBlo0z4"},{"type":"inlineMath","value":"V^{\\star}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"VV^{\\star}V","key":"Kzf82Y77bl"},{"type":"text","value":", we already know\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"NSONKJMyVV"},{"type":"inlineMath","value":"V_\\hi^{\\star}(s) \\ge V_\\hi^{\\hat \\pi}(s)","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Vh(s)Vhπ^(s)V_\\hi^{\\star}(s) \\ge V_\\hi^{\\hat \\pi}(s)Vh(s)Vhπ^(s)","key":"rySjunCmQt"},{"type":"text","value":". So for equality to hold we just\nneed to show that ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"C4sgVYePWS"},{"type":"inlineMath","value":"V_\\hi^{\\star}(s) \\le V_\\hi^{\\hat \\pi}(s)","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Vh(s)Vhπ^(s)V_\\hi^{\\star}(s) \\le V_\\hi^{\\hat \\pi}(s)Vh(s)Vhπ^(s)","key":"Ljr1YlDclp"},{"type":"text","value":". 
We’ll first\nshow that the Bellman operator ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"WyIOc09Pfs"},{"type":"inlineMath","value":"\\mathcal{J}^{\\hat \\pi}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Jπ^\\mathcal{J}^{\\hat \\pi}Jπ^","key":"pWlKcs9o0f"},{"type":"text","value":" never decreases\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"UsTPbNUyt9"},{"type":"inlineMath","value":"V_\\hi^{\\star}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"VhV_\\hi^{\\star}Vh","key":"ynbhQxleyO"},{"type":"text","value":". Then we’ll apply this result recursively to show that\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"Us9c7EkZDB"},{"type":"inlineMath","value":"V^{\\star} = V^{\\hat \\pi}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"V=Vπ^V^{\\star} = V^{\\hat \\pi}V=Vπ^","key":"RBSfJPW7Kx"},{"type":"text","value":".","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"IurokSIZ1f"}],"key":"ivYdTdDqpS"},{"type":"proof","kind":"lemma","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman operator never decreases the optimal value function","position":{"start":{"line":761,"column":1},"end":{"line":761,"column":1}},"key":"W69gViOw8i"}],"key":"jO6gpcZ12M"},{"type":"paragraph","position":{"start":{"line":762,"column":1},"end":{"line":763,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{J}^{\\hat \\pi}","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"html":"Jπ^\\mathcal{J}^{\\hat \\pi}Jπ^","key":"BWPPFoGd5x"},{"type":"text","value":" never decreases ","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"PDtUcdKyOL"},{"type":"inlineMath","value":"V_\\hi^{\\star}","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"html":"VhV_\\hi^{\\star}Vh","key":"Jbq0U94gwB"},{"type":"text","value":"\n(elementwise):","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"P76gy5o4M3"}],"key":"SU6govKDv4"},{"type":"math","value":"[\\mathcal{J}^{\\hat \\pi} (V_{\\hi+1}^{\\star})](s) \\ge V_\\hi^{\\star}(s).","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"[Jπ^(Vh+1)](s)Vh(s).[\\mathcal{J}^{\\hat \\pi} (V_{\\hi+1}^{\\star})](s) \\ge V_\\hi^{\\star}(s).[Jπ^(Vh+1)](s)Vh(s).","enumerator":"1.21","key":"xyQLRcM1a5"},{"type":"paragraph","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"strong","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"text","value":"Proof:","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"key":"lVE8tovmk7"}],"key":"oKq8u2Ama9"}],"key":"Y1YkwLqGRG"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^{\\star}(s) &= \\max_{\\pi \\in \\Pi} V_\\hi^{\\pi}(s) \\\\\n &= \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^\\pi(s') \\right] && \\text{Bellman consistency} \\\\\n &\\le \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{definition of } V^\\star \\\\\n &= \\max_{a} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} 
V_{\\hi+1}^{\\star}(s') \\right] && \\text{only depends on } \\pi \\text{ via } a \\\\\n &= [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s). \n\\end{aligned}","position":{"start":{"line":769,"column":1},"end":{"line":777,"column":1}},"html":"Vh(s)=maxπΠVhπ(s)=maxπΠEaπ()[r(s,a)+EsP(s,a)Vh+1π(s)]Bellman consistencymaxπΠEaπ()[r(s,a)+EsP(s,a)Vh+1(s)]definition of V=maxa[r(s,a)+EsP(s,a)Vh+1(s)]only depends on π via a=[Jπ^(Vh+1)](s).\\begin{aligned}\n V_\\hi^{\\star}(s) &= \\max_{\\pi \\in \\Pi} V_\\hi^{\\pi}(s) \\\\\n &= \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^\\pi(s') \\right] && \\text{Bellman consistency} \\\\\n &\\le \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{definition of } V^\\star \\\\\n &= \\max_{a} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{only depends on } \\pi \\text{ via } a \\\\\n &= [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s). \n\\end{aligned}Vh(s)=πΠmaxVhπ(s)=πΠmaxEaπ()[r(s,a)+EsP(s,a)Vh+1π(s)]πΠmaxEaπ()[r(s,a)+EsP(s,a)Vh+1(s)]=amax[r(s,a)+EsP(s,a)Vh+1(s)]=[Jπ^(Vh+1)](s).Bellman consistencydefinition of Vonly depends on π via a","enumerator":"1.22","key":"HLC80W5cU1"},{"type":"paragraph","position":{"start":{"line":779,"column":1},"end":{"line":781,"column":1}},"children":[{"type":"text","value":"Note that the chosen action ","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"key":"szceLOYj8O"},{"type":"inlineMath","value":"a \\sim \\pi(\\dots)","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"html":"aπ()a \\sim \\pi(\\dots)aπ()","key":"UxOAAROhYo"},{"type":"text","value":" above\nmight depend on the past history; this isn’t shown in the notation and\ndoesn’t affect our result (make sure you see why).","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"key":"MP6JUngeBn"}],"key":"T8q0vBnEvc"}],"enumerator":"1.1","key":"HT1BBEnOhc"},{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"We can now apply this result recursively to get","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"zbLCB1FNGt"}],"key":"stcrXBgn0W"},{"type":"math","value":"V^{\\star}_t(s) \\le V^{\\hat \\pi}_t(s)","position":{"start":{"line":786,"column":1},"end":{"line":786,"column":1}},"html":"Vt(s)Vtπ^(s)V^{\\star}_t(s) \\le V^{\\hat \\pi}_t(s)Vt(s)Vtπ^(s)","enumerator":"1.23","key":"sLmcnIuxz1"},{"type":"paragraph","position":{"start":{"line":788,"column":1},"end":{"line":790,"column":1}},"children":[{"type":"text","value":"as follows. 
(Note that even\nthough ","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"bpULSu1Q7o"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"π^\\hat \\piπ^","key":"dFtUb3yn0O"},{"type":"text","value":" is deterministic, we’ll use the ","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"QvIB9Ohwd5"},{"type":"inlineMath","value":"a \\sim \\hat \\pi(s)","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"aπ^(s)a \\sim \\hat \\pi(s)aπ^(s)","key":"ctlbprDsnC"},{"type":"text","value":"\nnotation to make it explicit that we’re sampling a trajectory from it.)","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"mUKshY9csf"}],"key":"hJCEhUgePp"},{"type":"math","value":"\\begin{aligned}\n V_{t}^{\\star}(s) &\\le [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s) \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue} V_{\\hi+1}^{\\star}(s')} \\right] \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue}[ \\mathcal{J}^{\\hat \\pi} (V_{t+2}^{\\star})] (s')} \\right] \\right] && \\text{above lemma} \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}{\\color{blue} \\left[ \\mathop{\\mathbb{E}}_{a' \\sim \\hat \\pi} r(s', a') + \\mathop{\\mathbb{E}}_{s''} V_{t+2}^{\\star}(s'') \\right]} \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\cdots && \\text{apply at all timesteps} \\\\\n &= \\mathop{\\mathbb{E}}_{\\tau \\sim \\rho^{\\hat \\pi}} [G_{t} \\mid s_\\hi = s] && \\text{rewrite expectation} \\\\\n &= V_{t}^{\\hat \\pi}(s) && \\text{definition}\n\\end{aligned}","position":{"start":{"line":792,"column":1},"end":{"line":802,"column":1}},"html":"Vt(s)[Jπ^(Vh+1)](s)=Eaπ^(s)[r(s,a)+EsP(s,a)[Vh+1(s)]]definition of Jπ^Eaπ^(s)[r(s,a)+EsP(s,a)[[Jπ^(Vt+2)](s)]]above lemma=Eaπ^(s)[r(s,a)+EsP(s,a)[Eaπ^r(s,a)+EsVt+2(s)]]definition of Jπ^apply at all timesteps=Eτρπ^[Gtsh=s]rewrite expectation=Vtπ^(s)definition\\begin{aligned}\n V_{t}^{\\star}(s) &\\le [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s) \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue} V_{\\hi+1}^{\\star}(s')} \\right] \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue}[ \\mathcal{J}^{\\hat \\pi} (V_{t+2}^{\\star})] (s')} \\right] \\right] && \\text{above lemma} \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}{\\color{blue} \\left[ \\mathop{\\mathbb{E}}_{a' \\sim \\hat \\pi} r(s', a') + \\mathop{\\mathbb{E}}_{s''} V_{t+2}^{\\star}(s'') \\right]} \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\cdots && \\text{apply at all timesteps} \\\\\n &= \\mathop{\\mathbb{E}}_{\\tau \\sim \\rho^{\\hat \\pi}} [G_{t} \\mid s_\\hi = s] && \\text{rewrite expectation} \\\\\n &= V_{t}^{\\hat \\pi}(s) && 
\\text{definition}\n\\end{aligned}Vt(s)[Jπ^(Vh+1)](s)=Eaπ^(s)[r(s,a)+EsP(s,a)[Vh+1(s)]]Eaπ^(s)[r(s,a)+EsP(s,a)[[Jπ^(Vt+2)](s)]]=Eaπ^(s)[r(s,a)+EsP(s,a)[Eaπ^r(s,a)+Es′′Vt+2(s′′)]]=Eτρπ^[Gtsh=s]=Vtπ^(s)definition of Jπ^above lemmadefinition of Jπ^apply at all timestepsrewrite expectationdefinition","enumerator":"1.24","key":"Ci353Z0frw"},{"type":"paragraph","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"children":[{"type":"text","value":"And so we have ","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"CLzI9QufAV"},{"type":"inlineMath","value":"V^{\\star} = V^{\\hat \\pi}","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"html":"V=Vπ^V^{\\star} = V^{\\hat \\pi}V=Vπ^","key":"K4MrdAWw7B"},{"type":"text","value":", making ","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"Sj50lCz0v4"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"html":"π^\\hat \\piπ^","key":"NisRHLPUoE"},{"type":"text","value":" optimal.","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"ae84HbYMMn"}],"key":"bgol1cO81y"}],"enumerator":"1.1","key":"pAdJhHYi2S"},{"type":"paragraph","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Note that this also gives simplified forms of the ","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"fpf9jvNzkg"},{"type":"crossReference","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Bellman consistency","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"n66tzJHu5l"}],"identifier":"bellman_consistency","label":"bellman_consistency","kind":"proof:theorem","template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"RwODzzmYK0"},{"type":"text","value":" equations for the optimal policy:","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"KoUWw8QVF8"}],"key":"pV4UoIbma5"},{"type":"proof","kind":"corollary","label":"bellman_consistency_optimal","identifier":"bellman_consistency_optimal","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equations for the optimal policy","position":{"start":{"line":809,"column":1},"end":{"line":809,"column":1}},"key":"xJakbRgDfW"}],"key":"l1k7iHWWzi"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^\\star(s) &= \\max_a Q_\\hi^\\star(s, a) \\\\\n Q_\\hi^\\star(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\star(s')]\n\\end{aligned}","position":{"start":{"line":812,"column":1},"end":{"line":817,"column":1}},"html":"Vh(s)=maxaQh(s,a)Qh(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]\\begin{aligned}\n V_\\hi^\\star(s) &= \\max_a Q_\\hi^\\star(s, a) \\\\\n Q_\\hi^\\star(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\star(s')]\n\\end{aligned}Vh(s)Qh(s,a)=amaxQh(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]","enumerator":"1.25","key":"fDSiqd6y6I"}],"enumerator":"1.1","html_id":"bellman-consistency-optimal","key":"NdFTbkHFM1"},{"type":"paragraph","position":{"start":{"line":820,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"Now that we’ve shown this particular greedy policy is optimal, all we\nneed to do is compute the optimal value function and optimal policy. 
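Once $Q^\star$ is in hand, extracting this greedy policy is a one-liner. A minimal sketch, assuming the `jnp`, `Float`, and `Array` imports from the earlier cells (the helper name `q_to_greedy` is ours, not part of the chapter's code):

```python
def q_to_greedy(q: Float[Array, "S A"]) -> Float[Array, "S A"]:
    """Deterministic greedy policy for a Q-table, one-hot over actions."""
    # argmax over the action axis gives one action index per state;
    # indexing the identity matrix turns those indices into one-hot rows
    return jnp.eye(q.shape[1])[jnp.argmax(q, axis=1)]
```

It remains to compute $Q^\star$ itself.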
We\ncan do this by working backwards in time using ","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"rcFumtuQET"},{"type":"strong","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"NDzBUv1zJc"}],"key":"hBFzW2kvF2"},{"type":"text","value":"\n(DP).","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"Gs69dgiLfo"}],"key":"DKfRq5fUrm"},{"type":"proof","kind":"definition","label":"pi_star_dp","identifier":"pi_star_dp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"DP algorithm to compute an optimal policy in a finite-horizon MDP","position":{"start":{"line":825,"column":1},"end":{"line":825,"column":1}},"key":"Bab68HuCFg"}],"key":"x8ElgOXSSI"},{"type":"paragraph","position":{"start":{"line":828,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"strong","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"Base case.","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"jsWn4gwf7f"}],"key":"VNSZVIxG8f"},{"type":"text","value":" At the end of the episode (time step ","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"UC1p4lxWJK"},{"type":"inlineMath","value":"H-1","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"html":"H1H-1H1","key":"jdTs7QRJnG"},{"type":"text","value":"), we can’t\ntake any more actions, so the ","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"rWSLyhoszm"},{"type":"inlineMath","value":"Q","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"html":"QQQ","key":"InwBmt69vT"},{"type":"text","value":"-function is simply the reward that\nwe obtain:","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"TNd0WSV7Yh"}],"key":"eBmZNxRbVA"},{"type":"math","value":"Q^\\star_{H-1}(s, a) = r(s, a)","position":{"start":{"line":832,"column":1},"end":{"line":832,"column":1}},"html":"QH1(s,a)=r(s,a)Q^\\star_{H-1}(s, a) = r(s, a)QH1(s,a)=r(s,a)","enumerator":"1.26","key":"Ki7HkFCXwi"},{"type":"paragraph","position":{"start":{"line":834,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"so the best thing to do\nis just act greedily and get as much reward as we can!","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"k7LeREDaDC"}],"key":"q3nw2dZ6aT"},{"type":"math","value":"\\pi^\\star_{H-1}(s) = \\arg\\max_a Q^\\star_{H-1}(s, a)","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"html":"πH1(s)=argmaxaQH1(s,a)\\pi^\\star_{H-1}(s) = \\arg\\max_a Q^\\star_{H-1}(s, a)πH1(s)=argamaxQH1(s,a)","enumerator":"1.27","key":"V8uPsFv667"},{"type":"paragraph","position":{"start":{"line":839,"column":1},"end":{"line":841,"column":1}},"children":[{"type":"text","value":"Then\n","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"bvYeIfossM"},{"type":"inlineMath","value":"V^\\star_{H-1}(s)","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"html":"VH1(s)V^\\star_{H-1}(s)VH1(s)","key":"M9vm7rboXx"},{"type":"text","value":", the optimal value of state 
","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"sAAQXVMMq6"},{"type":"inlineMath","value":"s","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"html":"sss","key":"FJGgJeBdHg"},{"type":"text","value":" at the end of the\ntrajectory, is simply whatever action gives the most reward.","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"Xyi8A3291v"}],"key":"OBaWth6YKg"},{"type":"math","value":"V^\\star_{H-1} = \\max_a Q^\\star_{H-1}(s, a)","position":{"start":{"line":843,"column":1},"end":{"line":843,"column":1}},"html":"VH1=maxaQH1(s,a)V^\\star_{H-1} = \\max_a Q^\\star_{H-1}(s, a)VH1=amaxQH1(s,a)","enumerator":"1.28","key":"MDDs6uVzgx"},{"type":"paragraph","position":{"start":{"line":845,"column":1},"end":{"line":847,"column":1}},"children":[{"type":"strong","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"children":[{"type":"text","value":"Recursion.","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"xTW9AdVccj"}],"key":"zKbYlSXBGl"},{"type":"text","value":" Then, we can work backwards in time, starting from the\nend, using our consistency equations! i.e. for each\n","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"MMhJRfShcs"},{"type":"inlineMath","value":"t = H-2, \\dots, 0","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"html":"t=H2,,0t = H-2, \\dots, 0t=H2,,0","key":"m3tUgiGcuM"},{"type":"text","value":", we set","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"uygtDvJdRM"}],"key":"yonf8Y3MvL"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_{t}(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V^\\star_{\\hi+1}(s')] \\\\\n \\pi^\\star_{t}(s) &= \\arg\\max_a Q^\\star_{t}(s, a) \\\\\n V^\\star_{t}(s) &= \\max_a Q^\\star_{t}(s, a)\n\\end{aligned}","position":{"start":{"line":849,"column":1},"end":{"line":855,"column":1}},"html":"Qt(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]πt(s)=argmaxaQt(s,a)Vt(s)=maxaQt(s,a)\\begin{aligned}\n Q^\\star_{t}(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V^\\star_{\\hi+1}(s')] \\\\\n \\pi^\\star_{t}(s) &= \\arg\\max_a Q^\\star_{t}(s, a) \\\\\n V^\\star_{t}(s) &= \\max_a Q^\\star_{t}(s, a)\n\\end{aligned}Qt(s,a)πt(s)Vt(s)=r(s,a)+EsP(s,a)[Vh+1(s)]=argamaxQt(s,a)=amaxQt(s,a)","enumerator":"1.29","key":"DFOiQR2OhV"}],"enumerator":"1.11","html_id":"pi-star-dp","key":"gLJK7Ni7Cg"}],"key":"SdiKta1534"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def find_optimal_policy(mdp: MDP):\n Q = [None] * mdp.H\n pi = [None] * mdp.H\n V = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon\n\n for h in range(mdp.H - 1, -1, -1):\n Q[h] = mdp.r + mdp.P @ V[h + 1]\n pi[h] = jnp.eye(mdp.S)[jnp.argmax(Q[h], axis=1)] # one-hot\n V[h] = jnp.max(Q[h], axis=1)\n\n Q = jnp.stack(Q)\n pi = jnp.stack(pi)\n V = jnp.stack(V[:-1])\n\n return pi, V, Q","key":"S4IjuSJuLh"},{"type":"output","id":"dboccwd4xw87y9dFJU2dl","data":[],"key":"PGnH5XsElu"}],"data":{},"key":"TWOJTWupmP"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":876,"column":1},"end":{"line":879,"column":1}},"children":[{"type":"text","value":"At each of the 
","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"uUNisUltWN"},{"type":"inlineMath","value":"H","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"HHH","key":"Utm5Ls0k78"},{"type":"text","value":" timesteps, we must compute ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"Gtffo7hvjB"},{"type":"inlineMath","value":"Q^{\\star}","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"QQ^{\\star}Q","key":"jkqNEKkT18"},{"type":"text","value":" for each of\nthe ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"kvq8xmYQRA"},{"type":"inlineMath","value":"|\\mathcal{S}| |\\mathcal{A}|","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"SA|\\mathcal{S}| |\\mathcal{A}|S∣∣A","key":"Bz8R8riLVl"},{"type":"text","value":" state-action pairs. Each computation takes ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"d1wVlgcBBi"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"S|\\mathcal{S}|S","key":"EYFWvCCZjW"},{"type":"text","value":"\noperations to evaluate the average value over ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"e9LVwZ63vt"},{"type":"inlineMath","value":"s'","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"ss's","key":"pqUegsyTlh"},{"type":"text","value":". This gives a total\ncomputation time of ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"TaZVPtKQXK"},{"type":"inlineMath","value":"O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"O(HS2A)O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)O(HS2A)","key":"t2xwDciTQa"},{"type":"text","value":".","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"oGW72zurhT"}],"key":"DgJ2S1SpE0"},{"type":"paragraph","position":{"start":{"line":881,"column":1},"end":{"line":886,"column":1}},"children":[{"type":"text","value":"Note that this algorithm is identical to the policy evaluation algorithm\n","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"SiP1HhU9r1"},{"type":"crossReference","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"inlineCode","value":"dp_eval_finite","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"ivzCaesKAL"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","key":"DkOW4IAxQX"},{"type":"text","value":", but instead of ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"GzOnsVlCMp"},{"type":"emphasis","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"averaging","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"imGcuTbUck"}],"key":"g9qTb0TlyH"},{"type":"text","value":" over the\nactions chosen by a policy, we instead simply take a 
","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"fVuOTLT8gx"},{"type":"emphasis","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"maximum","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"ScHb8uDzAb"}],"key":"gX2Ao65p4z"},{"type":"text","value":" over the\naction-values. We’ll see this relationship between ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"GcfAxz40G2"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"VlOOnb8aaJ"}],"key":"uOeouPbfnB"},{"type":"text","value":"\nand ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"xOoea3LZr8"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"optimal policy computation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"XEuwoeIkVz"}],"key":"KmksqBAGxg"},{"type":"text","value":" show up again in the infinite-horizon\nsetting.","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"GOoy6NZzZh"}],"key":"V1dg8VGCbv"}],"key":"ZGWSejlbsw"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp)\nassert jnp.allclose(π_opt, tidy_policy_messy_only)\nassert jnp.allclose(V_opt, V_messy)\nassert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:])\n\"Assertions passed (the 'tidy when messy' policy is optimal)\"","key":"QiziMetcku"},{"type":"output","id":"A8ZM9Be2sA7OuUs-KmPll","data":[{"output_type":"execute_result","execution_count":16,"metadata":{},"data":{"text/plain":{"content":"\"Assertions passed (the 'tidy when messy' policy is optimal)\"","content_type":"text/plain"}}}],"key":"imCT46I43p"}],"data":{},"key":"JjEIyHMuML"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"children":[{"type":"text","value":"Infinite-horizon MDPs","position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"key":"txsVqExX9Y"}],"label":"infinite_horizon_mdps","identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","enumerator":"1.4","key":"IPg8fJt8Ym"},{"type":"paragraph","position":{"start":{"line":899,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"What happens if a trajectory is allowed to continue forever (i.e.\n","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"Ziw9dduHWZ"},{"type":"inlineMath","value":"H = \\infty","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"html":"H=H = \\inftyH=","key":"CWRJ00RLZ2"},{"type":"text","value":")? 
This is the setting of ","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"y5TpCpkcCA"},{"type":"strong","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"children":[{"type":"text","value":"infinite horizon","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"i5Xj3yVgGu"}],"key":"M6vdUQL6Em"},{"type":"text","value":" MDPs.","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"IsQ1qONNtd"}],"key":"w1o3Txu5u8"},{"type":"paragraph","position":{"start":{"line":902,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"In this chapter, we’ll describe the necessary adjustments from the\nfinite-horizon case to make the problem tractable. We’ll show that the\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"I4CSEQeMiA"},{"type":"crossReference","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"ownnTzCsmc"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"zjfZLvJnHL"},{"type":"text","value":" in the discounted reward setting is a\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"pF2pfooBjV"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"DnejkE8k8U"}],"key":"CnkC4mBAZU"},{"type":"text","value":" for any policy.\nWe’ll discuss how to evaluate\npolicies (i.e. compute their corresponding value functions). 
Finally,\nwe’ll present and analyze two iterative algorithms, based on the Bellman\noperator, for computing the optimal policy: ","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"vufibmxORJ"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"ON4YRvD8IT"}],"key":"OsYi4M9poi"},{"type":"text","value":" and\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"uTitzODm5Y"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"ED92zNNiCz"}],"key":"GvBJPQftbu"},{"type":"text","value":".","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"y8eREUg3Sh"}],"key":"RUYJaNcNu6"},{"type":"heading","depth":3,"position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"children":[{"type":"text","value":"Discounted rewards","position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"key":"uPuwtuxd3m"}],"identifier":"discounted-rewards","label":"Discounted rewards","html_id":"discounted-rewards","implicit":true,"enumerator":"1.4.1","key":"s1sG5pQcbT"},{"type":"paragraph","position":{"start":{"line":914,"column":1},"end":{"line":918,"column":1}},"children":[{"type":"text","value":"First of all, note that maximizing the cumulative reward\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"OhcTirdjW8"},{"type":"inlineMath","value":"r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdots","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"rh+rh+1+rh+2+r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdotsrh+rh+1+rh+2+","key":"SYgx33VXd0"},{"type":"text","value":" is no longer a good idea since it\nmight blow up to infinity. 
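To see the problem concretely: with a constant reward of 1 per step, the partial sums grow without bound, while geometric discounting (using the discount factor $\gamma$ introduced just below) keeps the total finite. A quick numerical check, assuming `jnp` from the earlier cells:

```python
γ = 0.95
k = jnp.arange(500)
print(jnp.sum(jnp.ones(500)))  # undiscounted partial sum: 500.0, and growing with more terms
print(jnp.sum(γ**k))           # discounted sum: ≈ 20.0 = 1 / (1 - γ), stable as terms are added
```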
Instead of a time horizon ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"keDE7CYfaC"},{"type":"inlineMath","value":"H","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"HHH","key":"Tgoez4soHp"},{"type":"text","value":", we now need a\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"mz9urzqGOk"},{"type":"strong","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"children":[{"type":"text","value":"discount factor","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"wYFWNMm6V7"}],"key":"ZX9pr7xXLT"},{"type":"text","value":" ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"eowdrtaR09"},{"type":"inlineMath","value":"\\gamma \\in [0, 1)","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"γ[0,1)\\gamma \\in [0, 1)γ[0,1)","key":"ClFdOj8taP"},{"type":"text","value":" such that rewards become less\nvaluable the further into the future they are:","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"bBngRauWeN"}],"key":"RREdXKDPgo"},{"type":"math","value":"r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.","position":{"start":{"line":920,"column":1},"end":{"line":920,"column":1}},"html":"rh+γrh+1+γ2rh+2+=k=0γkrh+k.r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.rh+γrh+1+γ2rh+2+=k=0γkrh+k.","enumerator":"1.30","key":"BQQbi9AMbm"},{"type":"paragraph","position":{"start":{"line":922,"column":1},"end":{"line":924,"column":1}},"children":[{"type":"text","value":"We can think of ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"Il3FEK7rr6"},{"type":"text","value":"γ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"mVnsYeP0DJ"},{"type":"text","value":" as measuring how much we care about the future:\nif it’s close to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"ASXrSXIyGi"},{"type":"text","value":"0","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"ILmm5Jn9Jf"},{"type":"text","value":", we only care about the near-term rewards; it’s\nclose to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"DIqWayC0DX"},{"type":"text","value":"1","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"wiWZpj9FHb"},{"type":"text","value":", we put more weight into future rewards.","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"CFmx2Lf0qD"}],"key":"TGRqk4cGx5"},{"type":"paragraph","position":{"start":{"line":926,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"You can also analyze ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"sD8CXydl7f"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"HB1adaeYBl"},{"type":"text","value":" as the probability of 
","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"Ga02JTB4wY"},{"type":"emphasis","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"children":[{"type":"text","value":"continuing","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"rBzXJLqNVX"}],"key":"HtvZVstIJC"},{"type":"text","value":" the\ntrajectory at each time step. (This is equivalent to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"n4zwFX83cT"},{"type":"inlineMath","value":"H","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"html":"HHH","key":"SgCnS5O2sB"},{"type":"text","value":" being\ndistributed by a First Success distribution with success probability\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"u24lHGds4R"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"EATRR0syhj"},{"type":"text","value":".) This accords with the above interpretation: if ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"v40ba5peEK"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"Bxm5W3PZ4M"},{"type":"text","value":" is\nclose to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"r3qvHGbGoP"},{"type":"text","value":"0","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"jVMzjUQ90t"},{"type":"text","value":", the trajectory will likely be very short, while if\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"aX5xOZmTyp"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"NSmUfvyUMk"},{"type":"text","value":" is close to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"qVuoExjngn"},{"type":"text","value":"1","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"AuVK9Iuh40"},{"type":"text","value":", the trajectory will likely continue for a long\ntime.","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"SzbqzgIvV0"}],"key":"c8yTqS1Rbi"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"UWVP512Bqk"}],"key":"WyGqYHtDTW"},{"type":"paragraph","position":{"start":{"line":935,"column":1},"end":{"line":937,"column":1}},"children":[{"type":"text","value":"Assuming that ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"WgtqqnZ279"},{"type":"inlineMath","value":"r_\\hi \\in [0, 1]","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"rh[0,1]r_\\hi \\in [0, 1]rh[0,1]","key":"xmoTqd8jbM"},{"type":"text","value":" for all ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"gsRBLYUHPt"},{"type":"inlineMath","value":"\\hi \\in \\mathbb{N}","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"hN\\hi \\in \\mathbb{N}hN","key":"GSY5BsdFsf"},{"type":"text","value":",\nwhat is the maximum 
","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"xz3HQxVVGq"},{"type":"strong","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"children":[{"type":"text","value":"discounted","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"kLUiEwrkN9"}],"key":"t9FIqi1ksc"},{"type":"text","value":" cumulative reward? You may find it\nuseful to review geometric series.","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"eiVSFQ1E7t"}],"key":"eebdfKnETX"}],"key":"PMdkaLng4v"},{"type":"paragraph","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"The other components of the MDP remain the same:","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"pZtnHv9Joi"}],"key":"HAEJBYPFsH"},{"type":"math","value":"M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"html":"M=(S,A,μ,P,r,γ).M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).M=(S,A,μ,P,r,γ).","enumerator":"1.31","key":"h5Wx1MLh1H"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"Code-wise, we can reuse the ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"UD8eBnM8Xa"},{"type":"inlineCode","value":"MDP","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"N33b6RvLFm"},{"type":"text","value":" class from before ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"QcNawkzAXP"},{"type":"crossReference","kind":"proof:definition","identifier":"finite_horizon_mdp","label":"finite_horizon_mdp","children":[{"type":"text","value":"Definition ","key":"dosflvOnMx"},{"type":"text","value":"1.2","key":"kbWveCrZjc"}],"template":"Definition %s","enumerator":"1.2","resolved":true,"html_id":"finite-horizon-mdp","key":"VV8jLhTKlm"},{"type":"text","value":" and set ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"mgqg8qV9so"},{"type":"inlineCode","value":"mdp.H = float('inf')","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"CmYRFvQqPP"},{"type":"text","value":".","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"ThyXyLqA54"}],"key":"ofLYkUGAtp"}],"key":"HKGIBIztIb"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"tidy_mdp_inf = tidy_mdp._replace(H=float(\"inf\"), γ=0.95)","key":"j5uwuflTGL"},{"type":"output","id":"ieIueWCVK0DtKkyT9sQDR","data":[],"key":"geuVqBWW9D"}],"data":{},"key":"and4MQFUOM"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"children":[{"type":"text","value":"Stationary policies","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"wrnGRMEyvJ"}],"identifier":"stationary-policies","label":"Stationary policies","html_id":"stationary-policies","implicit":true,"enumerator":"1.4.2","key":"QGvCpsW1pX"},{"type":"paragraph","position":{"start":{"line":952,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"text","value":"The time-dependent policies from the finite-horizon case become\ndifficult to handle in the infinite-horizon case. 
In particular, many of\nthe DP approaches we saw required us to start at the end of the\ntrajectory, which is no longer possible. We’ll shift to ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"erhqBKVOWr"},{"type":"strong","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"children":[{"type":"text","value":"stationary","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"GBfaTse5IA"}],"key":"e6iLhuu2QQ"},{"type":"text","value":"\npolicies ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"hSGIFimWSI"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\to \\mathcal{A}","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"π:SA\\pi : \\mathcal{S} \\to \\mathcal{A}π:SA","key":"h1bSYu8Gcm"},{"type":"text","value":" (deterministic) or ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"jBH8mDTqti"},{"type":"inlineMath","value":"\\Delta(\\mathcal{A})","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"Δ(A)\\Delta(\\mathcal{A})Δ(A)","key":"jEUzeINVXG"},{"type":"text","value":" (stochastic).","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"IdlH0tZymD"}],"key":"pyyhdkOYn6"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"kvLnk5iMuN"}],"key":"xG68OWP4xA"},{"type":"paragraph","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Which of the policies in ","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"zMSClrCoJg"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"NEi23jubsO"},{"type":"text","value":"1.2","key":"OFqc27LTXG"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"gPRJhsJTRp"},{"type":"text","value":" are stationary?","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"Fhtb5Gb2Xj"}],"key":"LiiHKNeaDL"}],"key":"E31N9HDqAj"},{"type":"heading","depth":3,"position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"children":[{"type":"text","value":"Value functions and Bellman consistency","position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"key":"w9gCBVptlP"}],"identifier":"value-functions-and-bellman-consistency","label":"Value functions and Bellman consistency","html_id":"value-functions-and-bellman-consistency","implicit":true,"enumerator":"1.4.3","key":"tAt9ETf64P"},{"type":"paragraph","position":{"start":{"line":964,"column":1},"end":{"line":966,"column":1}},"children":[{"type":"text","value":"We also consider stationary value functions ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"SkyOSCeLTa"},{"type":"inlineMath","value":"V^\\pi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Vπ:SRV^\\pi : \\mathcal{S} \\to \\mathbb{R}Vπ:SR","key":"TzzIG0QUaa"},{"type":"text","value":" and\n","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"aMtAh0Cq5l"},{"type":"inlineMath","value":"Q^\\pi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Qπ:S×ARQ^\\pi : 
\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}Qπ:S×AR","key":"Co0tKSU2CO"},{"type":"text","value":". We need to insert a factor of ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"VuaQcmglxz"},{"type":"text","value":"γ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"Zyrd9J8g0r"},{"type":"text","value":"\ninto the Bellman consistency equation ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"ZliaXCHEgE"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"Qrh6dXSwiB"},{"type":"text","value":"1.1","key":"DU04fCGHsO"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"SURQvN0FOa"},{"type":"text","value":" to account for the discounting:","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"Gqux5X5UI6"}],"key":"AuTBZMPHXl"},{"type":"math","value":"\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}","label":"bellman_consistency_infinite","identifier":"bellman_consistency_infinite","html":"Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]for any hN=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]for any hN=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}Vπ(s)Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]for any hNfor any hN","enumerator":"1.32","html_id":"bellman-consistency-infinite","key":"swT6b0I7Og"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"EmbBpCHPmT"}],"key":"nLTjzb8iRS"},{"type":"paragraph","position":{"start":{"line":980,"column":1},"end":{"line":981,"column":1}},"children":[{"type":"text","value":"Heuristically speaking, why does it no longer matter which\ntime step we condition on when defining the value function?","position":{"start":{"line":980,"column":1},"end":{"line":980,"column":1}},"key":"Vy8c71a2So"}],"key":"yqvQchcsdS"}],"key":"x4OS4VQGcB"},{"type":"heading","depth":2,"position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"children":[{"type":"text","value":"Solving infinite-horizon 
MDPs","position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"key":"CiZHLhHIeQ"}],"identifier":"solving-infinite-horizon-mdps","label":"Solving infinite-horizon MDPs","html_id":"solving-infinite-horizon-mdps","implicit":true,"enumerator":"1.5","key":"XyigiFqvJN"},{"type":"heading","depth":3,"position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"key":"wiMgo6KOQE"}],"identifier":"the-bellman-operator-is-a-contraction-mapping","label":"The Bellman operator is a contraction mapping","html_id":"the-bellman-operator-is-a-contraction-mapping","implicit":true,"enumerator":"1.5.1","key":"cr7T4xYFP7"},{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":991,"column":1}},"children":[{"type":"text","value":"Recall from ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"QTTQP1YRgP"},{"type":"crossReference","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"children":[{"type":"text","value":"Definition ","key":"THojxedjeU"},{"type":"text","value":"1.8","key":"RF5UDGlYrP"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"h9pO5tdDzO"},{"type":"text","value":" that the Bellman operator ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"p2nFFLjB7q"},{"type":"inlineMath","value":"\\mathcal{J}^{\\pi}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"Jπ\\mathcal{J}^{\\pi}Jπ","key":"po42ZpGCKI"},{"type":"text","value":"\nfor a policy ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"nAScShAqU3"},{"type":"text","value":"π","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"Jjigx4qWkO"},{"type":"text","value":" takes in a “value function” ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"VWdIwFKei9"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"apBCwCP9Vc"},{"type":"text","value":" and\nreturns the r.h.s. of the Bellman equation for that “value function”. 
In\nthe infinite-horizon setting, this is","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"rwAopuxGt0"}],"key":"hVkLfsLCqP"},{"type":"math","value":"[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma v(s')].","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].","enumerator":"1.33","key":"EyoGsLpHou"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"The crucial property of the Bellman operator is that it is a\n","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"CPppWhrxEo"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Gx4EO6iRdJ"}],"key":"x2Gnrk8sds"},{"type":"text","value":" for any policy. Intuitively, if we start with\ntwo “value functions” ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"olQwXmQ0an"},{"type":"inlineMath","value":"v, u : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"html":"v,u:SRv, u : \\mathcal{S} \\to \\mathbb{R}v,u:SR","key":"CygdmGBiL0"},{"type":"text","value":" and repeatedly apply the\nBellman operator to each of them, they will get closer and closer\ntogether at an exponential rate.","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"pBRbUcynIW"}],"key":"ipvpQWQBIT"},{"type":"proof","kind":"definition","label":"contraction","identifier":"contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Contraction mapping","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"WiofhdEG0J"}],"key":"XKhP5ne4xK"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1005,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"H4mg3qjPQn"},{"type":"inlineMath","value":"X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"XXX","key":"ACJsgrq2Et"},{"type":"text","value":" be some space with a norm ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"wYDrjJ0VCl"},{"type":"inlineMath","value":"\\|\\cdot\\|","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"\|\cdot\|","key":"lWs0fuhWoD"},{"type":"text","value":". 
We call an operator\n","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"L7RAIUPdXa"},{"type":"inlineMath","value":"f: X \\to X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"f:XXf: X \\to Xf:XX","key":"THeetxgXqC"},{"type":"text","value":" a ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"nls0nFpUjf"},{"type":"strong","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"v8LPxq3yoj"}],"key":"yJGMxBZcaf"},{"type":"text","value":" if for any ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"cJNezHz1dP"},{"type":"inlineMath","value":"x, y \\in X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"x,yXx, y \\in Xx,yX","key":"gBIV1S9qSo"},{"type":"text","value":",","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"qd5eLIUurL"}],"key":"fOneFMhzc7"},{"type":"math","value":"\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|","position":{"start":{"line":1007,"column":1},"end":{"line":1007,"column":1}},"html":"f(x)f(y)γxy\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|f(x)f(y)γxy","enumerator":"1.34","key":"OQfpcC9Ds0"},{"type":"paragraph","position":{"start":{"line":1009,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"for some fixed ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"tggrQVF0hv"},{"type":"inlineMath","value":"\\gamma \\in (0, 1)","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"html":"γ(0,1)\\gamma \\in (0, 1)γ(0,1)","key":"yUzuWPqtVP"},{"type":"text","value":".\nIntuitively, this means that if two points are ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"Oh6CPYqOjP"},{"type":"text","value":"δ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"CVJMslW2B2"},{"type":"text","value":" far apart,\nafter applying the mapping, they will be at most γδ apart.","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"bG5aGYIKeT"}],"key":"NIZixUY8EJ"}],"enumerator":"1.12","html_id":"contraction","key":"mPfov96t9d"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"iptmcdcFV0"}],"key":"evwdyttaBE"},{"type":"paragraph","position":{"start":{"line":1016,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Show that for a contraction mapping ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"CGRVvuJPol"},{"type":"inlineMath","value":"f","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"fff","key":"fdB2ZPj1mb"},{"type":"text","value":" with coefficient\n","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"j4OAEQfsqy"},{"type":"text","value":"γ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"NzvRhekguR"},{"type":"text","value":", for all ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"IUywX33z2Q"},{"type":"inlineMath","value":"t \\in \\mathbb{N}","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"tNt \in 
\\mathbb{N}tN","key":"HRFLyrbWEc"},{"type":"text","value":",","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"CIuujYKNRg"}],"key":"Nu45YeQHOj"},{"type":"math","value":"\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"f(t)(x)f(t)(y)γtxy,\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,f(t)(x)f(t)(y)γtxy,","enumerator":"1.35","key":"cFNx1zOaKH"},{"type":"paragraph","position":{"start":{"line":1021,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"i.e. that any\ntwo points will be pushed closer by at least a factor of ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"Y2WohtaOYU"},{"type":"text","value":"γ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"UfpveUUgL6"},{"type":"text","value":" at\neach iteration.","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"leynOMQMcd"}],"key":"Rb8JmI7PIC"}],"key":"iAiy3Cmnk2"},{"type":"paragraph","position":{"start":{"line":1026,"column":1},"end":{"line":1029,"column":1}},"children":[{"type":"text","value":"It is a powerful fact (known as the ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"yv1xOtbk6N"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"Banach fixed-point theorem","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"F4GT7cNlEK"}],"key":"sTUxA4ijC4"},{"type":"text","value":") that\nevery contraction mapping has a unique ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"v9eWDUDhsn"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"fixed point","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"rikuVyPkPz"}],"key":"jHVXB3OxpU"},{"type":"text","value":" ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"BpnFNAp6bR"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"kwM1iwsSKo"},{"type":"text","value":" such\nthat ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"lsftPzLfF8"},{"type":"inlineMath","value":"f(x^\\star) = x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"f(x)=xf(x^\\star) = x^\\starf(x)=x","key":"IvE71uaWRA"},{"type":"text","value":". 
This means that if we repeatedly apply ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"EtdVVz6KVm"},{"type":"inlineMath","value":"f","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"fff","key":"Z7vsgq6Mzo"},{"type":"text","value":"\nto any starting point, we will eventually converge to ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"fnldjlphnw"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"bmZjnBV9qb"},{"type":"text","value":":","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"SaN9PHj2Ql"}],"key":"Vd4TPwXuPt"},{"type":"math","value":"\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.","label":"contraction_convergence","identifier":"contraction_convergence","html":"f(t)(x)xγtxx.\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.f(t)(x)xγtxx∥.","enumerator":"1.36","html_id":"contraction-convergence","key":"Ro08MkW4Rq"},{"type":"paragraph","position":{"start":{"line":1037,"column":1},"end":{"line":1040,"column":1}},"children":[{"type":"text","value":"Let’s return to the RL setting and apply this result to the Bellman\noperator. How can we measure the distance between two “value functions”\n","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"ockcGQwcIR"},{"type":"inlineMath","value":"v, u : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"html":"v,u:SRv, u : \\mathcal{S} \\to \\mathbb{R}v,u:SR","key":"Io3Djja5x7"},{"type":"text","value":"? We’ll take the ","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"UAvJQotecT"},{"type":"strong","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"children":[{"type":"text","value":"supremum norm","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"WmqOsl1g45"}],"key":"qhve5DDdzH"},{"type":"text","value":" as our distance\nmetric:","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"zMBznxXP4V"}],"key":"JnpOjXo83m"},{"type":"math","value":"\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,","position":{"start":{"line":1042,"column":1},"end":{"line":1042,"column":1}},"html":"vu:=supsSv(s)u(s),\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,vu:=sSsupv(s)u(s),","enumerator":"1.37","key":"rAjh5QYYfN"},{"type":"paragraph","position":{"start":{"line":1044,"column":1},"end":{"line":1048,"column":1}},"children":[{"type":"text","value":"i.e.\nwe compare the “value functions” on the state that causes the biggest\ngap between them. 
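Before applying this to the Bellman operator, here is a small numerical sketch of contraction at work (the values of P and r below are hypothetical, not the tidying MDP): iterating the affine map f(v) = r + γPv from two different starting points shrinks their supremum-norm gap by at least a factor of γ per application.

```python
import jax.numpy as jnp

γ = 0.9
P = jnp.array([[0.8, 0.2],
               [0.5, 0.5]])  # hypothetical transition matrix; rows sum to 1
r = jnp.array([1.0, 0.0])    # hypothetical rewards

def f(v):
    return r + γ * P @ v     # an affine contraction, mirroring the Bellman operator

v, u = jnp.zeros(2), 10.0 * jnp.ones(2)
for t in range(5):
    print(t, jnp.max(jnp.abs(v - u)))  # prints 10.0, 9.0, 8.1, 7.29, 6.561
    v, u = f(v), f(u)
```

(In this particular example v − u is a constant vector and each row of P sums to 1, so the gap contracts by exactly γ per step; in general it contracts by at most γ.)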
Then ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"CXIXi9EBy0"},{"type":"crossReference","kind":"equation","identifier":"contraction_convergence","label":"contraction_convergence","children":[{"type":"text","value":"(","key":"KmvmYrS2Wd"},{"type":"text","value":"1.36","key":"Uro6dcJv0a"},{"type":"text","value":")","key":"MTzNM3GZ75"}],"template":"(%s)","enumerator":"1.36","resolved":true,"html_id":"contraction-convergence","key":"KLFImq48Zs"},{"type":"text","value":" implies that if we repeatedly\napply ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"a3Si0x3HPj"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"Jπ\\mathcal{J}^\\piJπ","key":"MUYtbDzeql"},{"type":"text","value":" to any starting “value function”, we will eventually\nconverge to ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"zXTYRlJ7C7"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"VπV^\\piVπ","key":"edoMZW9MP5"},{"type":"text","value":":","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"iXNKQpGAAA"}],"key":"wqETSU0iUD"},{"type":"math","value":"\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.","label":"bellman_convergence","identifier":"bellman_convergence","html":"(Jπ)(t)(v)VπγtvVπ.\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.(Jπ)(t)(v)VπγtvVπ.","enumerator":"1.38","html_id":"bellman-convergence","key":"GTCWHp5wXP"},{"type":"paragraph","position":{"start":{"line":1056,"column":1},"end":{"line":1057,"column":1}},"children":[{"type":"text","value":"We’ll use this useful fact to prove the convergence of several\nalgorithms later on.","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"key":"KoRYEuvTrb"}],"key":"Iuc1cRXGYF"},{"type":"proof","kind":"theorem","label":"bellman_contraction","identifier":"bellman_contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":1059,"column":1},"end":{"line":1059,"column":1}},"key":"moDs11uTrY"}],"key":"dYSZSY1i2f"},{"type":"math","value":"\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.","position":{"start":{"line":1062,"column":1},"end":{"line":1064,"column":1}},"html":"Jπ(v)Jπ(u)γvu.\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.Jπ(v)Jπ(u)γvu.","enumerator":"1.39","key":"ulvJoGyNrp"}],"enumerator":"1.4","html_id":"bellman-contraction","key":"QehsBfOJdz"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof of ","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"KHed2TJ8ov"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_contraction","label":"bellman_contraction","children":[{"type":"text","value":"Theorem ","key":"gGUInGimn7"},{"type":"text","value":"1.4","key":"l4vGVAtYGI"}],"template":"Theorem 
%s","enumerator":"1.4","resolved":true,"html_id":"bellman-contraction","key":"tEvjLVxT9q"}],"key":"tNpD8Wx3h9"},{"type":"paragraph","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"children":[{"type":"text","value":"For all states ","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"WxOHe3HIwe"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"jtBAQ5QL72"},{"type":"text","value":",","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"Il6E0j2LIu"}],"key":"cHPDWuWolw"},{"type":"math","value":"\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}","position":{"start":{"line":1071,"column":1},"end":{"line":1080,"column":1}},"html":"[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γmaxsv(s)u(s)=γvu.\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γsmaxv(s)u(s)=γvu.","enumerator":"1.40","key":"YfD9SfQA0e"}],"enumerator":"1.2","key":"HTfOL3E4mR"},{"type":"heading","depth":3,"position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"children":[{"type":"text","value":"Policy evaluation in infinite-horizon MDPs","position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"key":"C8J756gOxG"}],"identifier":"policy-evaluation-in-infinite-horizon-mdps","label":"Policy evaluation in infinite-horizon MDPs","html_id":"policy-evaluation-in-infinite-horizon-mdps","implicit":true,"enumerator":"1.5.2","key":"Co9NA0vAxk"},{"type":"paragraph","position":{"start":{"line":1085,"column":1},"end":{"line":1087,"column":1}},"children":[{"type":"text","value":"The backwards DP technique we used in ","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"Z1lK0mSLqK"},{"type":"crossReference","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"children":[{"type":"text","value":"the finite-horizon 
case","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"pW2SonWUqt"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","key":"hcj58GhtRU"},{"type":"text","value":" no\nlonger works since there is no “final timestep” to start from. We’ll\nneed another approach to policy evaluation.","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"dlxWo21N6Y"}],"key":"s0BbdsJr9X"},{"type":"paragraph","position":{"start":{"line":1089,"column":1},"end":{"line":1092,"column":1}},"children":[{"type":"text","value":"The Bellman consistency conditions yield a system of equations we can\nsolve to evaluate a deterministic policy ","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"EzjHAu2CxB"},{"type":"emphasis","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"YSWoKWi4A8"}],"key":"sS6yShvSyJ"},{"type":"text","value":". For a faster approximate solution,\nwe can iterate the policy’s Bellman operator, since we know that it has\na unique fixed point at the true value function.","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"nQR95ACcSb"}],"key":"S38INlp3IN"},{"type":"heading","depth":4,"position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"children":[{"type":"text","value":"Matrix inversion for deterministic policies","position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"key":"KktIXDnAZP"}],"identifier":"matrix-inversion-for-deterministic-policies","label":"Matrix inversion for deterministic policies","html_id":"matrix-inversion-for-deterministic-policies","implicit":true,"enumerator":"1.5.2.1","key":"Kq6JI3wwso"},{"type":"paragraph","position":{"start":{"line":1096,"column":1},"end":{"line":1098,"column":1}},"children":[{"type":"text","value":"Note that when the policy ","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"JVMysCgtlE"},{"type":"text","value":"π","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"YADNxMYKY1"},{"type":"text","value":" is deterministic, the actions can be\ndetermined from the states, and so we can chop off the action dimension\nfor the rewards and state transitions:","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"afLZaxlLnV"}],"key":"tXit5AEMXr"},{"type":"math","value":"\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}.\n\\end{aligned}","position":{"start":{"line":1100,"column":1},"end":{"line":1105,"column":1}},"html":"rπRSPπ[0,1]S×Sμ[0,1]SπASVπRSQπRS×A.\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times 
|\\mathcal{A}|}.\n\\end{aligned}rππRSASPπVπ[0,1]S×SRSμQπ[0,1]SRS×A.","enumerator":"1.41","key":"mKxzuJX4uD"},{"type":"paragraph","position":{"start":{"line":1107,"column":1},"end":{"line":1109,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"HcjgdtkQko"},{"type":"inlineMath","value":"P^\\pi","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"PπP^\\piPπ","key":"gTlhzGkbiv"},{"type":"text","value":", we’ll treat the rows as the states and the\ncolumns as the next states. Then ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"UxHEWfbE66"},{"type":"inlineMath","value":"P^\\pi_{s, s'}","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"Ps,sπP^\\pi_{s, s'}Ps,sπ","key":"zEFAeinv72"},{"type":"text","value":" is the probability of\ntransitioning from state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"iTUrXovAue"},{"type":"inlineMath","value":"s","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"sss","key":"WpsmWF4ote"},{"type":"text","value":" to state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"TMpCcoCeg4"},{"type":"inlineMath","value":"s'","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"ss's","key":"qkxhtCqUGB"},{"type":"text","value":" under policy ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"BacgOl7dXb"},{"type":"text","value":"π","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"sOMACHeyPt"},{"type":"text","value":".","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"YHMVCvlBVK"}],"key":"OdywGqbc4O"},{"type":"proof","kind":"example","label":"tidy_tabular","identifier":"tidy_tabular","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying MDP","position":{"start":{"line":1111,"column":1},"end":{"line":1111,"column":1}},"key":"XMoPvyW9aA"}],"key":"AcGfRJCNgv"},{"type":"paragraph","position":{"start":{"line":1114,"column":1},"end":{"line":1116,"column":1}},"children":[{"type":"text","value":"The tabular MDP from before has ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"uhwe7f1FhB"},{"type":"inlineMath","value":"|\\mathcal{S}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"S=2|\\mathcal{S}| = 2S=2","key":"dgCXV9NGCY"},{"type":"text","value":" and ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"iq4heP4j9w"},{"type":"inlineMath","value":"|\\mathcal{A}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"A=2|\\mathcal{A}| = 2A=2","key":"Jpb4W0JynO"},{"type":"text","value":". 
Let’s write\ndown the quantities for the policy ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"fwUt82LvAN"},{"type":"text","value":"π","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"jR8g74nd8G"},{"type":"text","value":" that tidies if and only if the\nroom is messy:","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"yMdy52DrVQ"}],"key":"iedJCe3xHz"},{"type":"math","value":"r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}","position":{"start":{"line":1118,"column":1},"end":{"line":1120,"column":1}},"html":"rπ=[10],Pπ=[0.70.310],μ=[10]r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}rπ=[10],Pπ=[0.710.30],μ=[10]","enumerator":"1.42","key":"M8HpQzzq8d"},{"type":"paragraph","position":{"start":{"line":1122,"column":1},"end":{"line":1123,"column":1}},"children":[{"type":"text","value":"We’ll see how to\nevaluate this policy in the next section.","position":{"start":{"line":1122,"column":1},"end":{"line":1122,"column":1}},"key":"MX1fpzUf4Q"}],"key":"IQPdLS7Ri5"}],"enumerator":"1.5","html_id":"tidy-tabular","key":"Qv8iBIQj7h"},{"type":"paragraph","position":{"start":{"line":1126,"column":1},"end":{"line":1127,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation for a deterministic policy can be\nwritten in tabular notation as","position":{"start":{"line":1126,"column":1},"end":{"line":1126,"column":1}},"key":"nM26lL13iz"}],"key":"eOmietn7AX"},{"type":"math","value":"V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.","position":{"start":{"line":1129,"column":1},"end":{"line":1129,"column":1}},"html":"Vπ=rπ+γPπVπ.V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.Vπ=rπ+γPπVπ.","enumerator":"1.43","key":"a6Bf4iFZnA"},{"type":"paragraph","position":{"start":{"line":1131,"column":1},"end":{"line":1133,"column":1}},"children":[{"type":"text","value":"(Unfortunately, this notation doesn’t simplify the expression for\n","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"nCiLiACPc1"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"html":"QπQ^\\piQπ","key":"V0FzlbBNz8"},{"type":"text","value":".) 
This system of equations can be solved with a matrix\ninversion:","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"pVsKtpXOcV"}],"key":"QAAGkeoZUv"},{"type":"math","value":"V^\\pi = (I - \\gamma P^\\pi)^{-1} r^\\pi.","label":"matrix_inversion_pe","identifier":"matrix_inversion_pe","html":"Vπ=(IγPπ)1rπ.V^\\pi = (I - \\gamma P^\\pi)^{-1} r^\\pi.Vπ=(IγPπ)1rπ.","enumerator":"1.44","html_id":"matrix-inversion-pe","key":"IVnZCVAKNZ"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"yLaGVmJ389"}],"key":"ockRNuOtek"},{"type":"paragraph","position":{"start":{"line":1142,"column":1},"end":{"line":1143,"column":1}},"children":[{"type":"text","value":"Note we’ve assumed that ","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"DQquqxIPTm"},{"type":"inlineMath","value":"I - \\gamma P^\\pi","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"html":"IγPπI - \\gamma P^\\piIγPπ","key":"pO4Pt5c9Mu"},{"type":"text","value":" is invertible. Can you see\nwhy this is the case?","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"gRijrtGEkK"}],"key":"Af0I80QW95"},{"type":"paragraph","position":{"start":{"line":1145,"column":1},"end":{"line":1149,"column":1}},"children":[{"type":"text","value":"(Recall that a linear operator, i.e. a square matrix, is invertible if\nand only if its null space is trivial; that is, it doesn’t map any\nnonzero vector to zero. In this case, we can see that ","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"xxeyhgyp9P"},{"type":"inlineMath","value":"I - \\gamma P^\\pi","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"html":"IγPπI - \\gamma P^\\piIγPπ","key":"JKDWPFZKSU"},{"type":"text","value":"\nis invertible because it maps any nonzero vector to a vector with at\nleast one nonzero element.)","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"tasLd218aG"}],"key":"If070nZ9DF"}],"key":"ObFHKdkGbQ"}],"key":"FpxCEnKeLr"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def eval_deterministic_infinite(\n mdp: MDP, policy: Float[Array, \"S A\"]\n) -> Float[Array, \" S\"]:\n pi = jnp.argmax(policy, axis=1) # un-one-hot\n P_π = mdp.P[jnp.arange(mdp.S), pi]\n r_π = mdp.r[jnp.arange(mdp.S), pi]\n return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π)","key":"kPTXzoVkjP"},{"type":"output","id":"lR3IMnfeh6ceeBjRS-hp8","data":[],"key":"vnJ2mohdEf"}],"data":{},"key":"i8ZGFIxzlB"},{"type":"block","children":[{"type":"proof","kind":"example","label":"tidy_eval_infinite","identifier":"tidy_eval_infinite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":1162,"column":1},"end":{"line":1162,"column":1}},"key":"S5k5uQGZ3n"}],"key":"bkK7dKODuj"},{"type":"paragraph","position":{"start":{"line":1165,"column":1},"end":{"line":1166,"column":1}},"children":[{"type":"text","value":"Let’s use the same policy ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"tPkaRi2RLc"},{"type":"text","value":"π","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"qwawNw7TgY"},{"type":"text","value":" that tidies if and only if the room is\nmessy. 
Setting ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"wBzFuAApt8"},{"type":"inlineMath","value":"\\gamma = 0.95","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"html":"γ=0.95\\gamma = 0.95γ=0.95","key":"CbDNzJHtrx"},{"type":"text","value":", we must invert","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"V3CvTq3D6i"}],"key":"YXTGaeCEWY"},{"type":"math","value":"I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.","position":{"start":{"line":1168,"column":1},"end":{"line":1168,"column":1}},"html":"IγPπ=[10.95×0.70.95×0.30.95×110.95×0]=[0.3350.2850.951].I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.IγPπ=[10.95×0.70.95×10.95×0.310.95×0]=[0.3350.950.2851].","enumerator":"1.45","key":"tasrJoEOIn"},{"type":"paragraph","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"children":[{"type":"text","value":"The inverse to two decimal points is","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"key":"kZWniX1e2g"}],"key":"i6uHJnUmPc"},{"type":"math","value":"(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.","position":{"start":{"line":1172,"column":1},"end":{"line":1172,"column":1}},"html":"(IγPπ)1=[15.564.4414.795.21].(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.(IγPπ)1=[15.5614.794.445.21].","enumerator":"1.46","key":"wjXCgx14ke"},{"type":"paragraph","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"children":[{"type":"text","value":"Thus the value function is","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"key":"HRtjY5nMt3"}],"key":"wODe0cmsev"},{"type":"math","value":"V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.","position":{"start":{"line":1176,"column":1},"end":{"line":1176,"column":1}},"html":"Vπ=(IγPπ)1rπ=[15.564.4414.795.21][10]=[15.5614.79].V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.Vπ=(IγPπ)1rπ=[15.5614.794.445.21][10]=[15.5614.79].","enumerator":"1.47","key":"BXfGD9iaVT"},{"type":"paragraph","position":{"start":{"line":1178,"column":1},"end":{"line":1181,"column":1}},"children":[{"type":"text","value":"Let’s sanity-check this result. 
Since rewards are at most ","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"IpHCKZQxbr"},{"type":"text","value":"1","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"XpOwJwV4WV"},{"type":"text","value":", the\nmaximum cumulative return of a trajectory is at most\n","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"WNp3xdseHg"},{"type":"inlineMath","value":"1/(1-\\gamma) = 20","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"html":"1/(1γ)=201/(1-\\gamma) = 201/(1γ)=20","key":"w9O9ighFVC"},{"type":"text","value":". We see that the value function is indeed slightly\nlower than this.","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"wRZq20WDDj"}],"key":"er8NUTOwyR"}],"enumerator":"1.6","html_id":"tidy-eval-infinite","key":"r3uzYwNogw"}],"key":"xTErSz1mtM"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])","key":"WpAZJi0yZ1"},{"type":"output","id":"dVXQnKoDUfx14cYpjTGaf","data":[{"output_type":"execute_result","execution_count":19,"metadata":{},"data":{"text/plain":{"content":"Array([15.56419, 14.78598], dtype=float32)","content_type":"text/plain"}}}],"key":"L3ypiFP93G"}],"data":{},"key":"Y0v9LipI2R"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"children":[{"type":"text","value":"Iterative policy evaluation","position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"key":"dSpwlulXai"}],"label":"iterative_pe","identifier":"iterative_pe","html_id":"iterative-pe","enumerator":"1.5.2.2","key":"E7TskrYI2Y"},{"type":"paragraph","position":{"start":{"line":1191,"column":1},"end":{"line":1194,"column":1}},"children":[{"type":"text","value":"The matrix inversion above takes roughly ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"Z5yHYL3GeG"},{"type":"inlineMath","value":"O(|\\mathcal{S}|^3)","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"html":"O(S3)O(|\\mathcal{S}|^3)O(S3)","key":"az1AaHbVon"},{"type":"text","value":" time.\nIt also only works for deterministic policies.\nCan we trade off the requirement of finding the ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"X9LmoYItLM"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"exact","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"WiyVt0AR3t"}],"key":"PqMl70YGQI"},{"type":"text","value":" value function for a faster\n","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"AuVwiqAodP"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"WvYeYb6kmM"}],"key":"LYN7yhlhrC"},{"type":"text","value":" algorithm that will also extend to stochastic policies?","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"cDl0YHAkyT"}],"key":"eXW3uvqoK6"},{"type":"paragraph","position":{"start":{"line":1196,"column":1},"end":{"line":1199,"column":1}},"children":[{"type":"text","value":"Let’s use the Bellman operator to 
define an iterative algorithm for\ncomputing the value function. We’ll start with an initial guess\n","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"yMMUE5WIon"},{"type":"inlineMath","value":"v^{(0)}","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"html":"v(0)v^{(0)}v(0)","key":"gul287JjEY"},{"type":"text","value":" with elements in ","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"UJUumCNW1p"},{"type":"inlineMath","value":"[0, 1/(1-\\gamma)]","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"html":"[0,1/(1γ)][0, 1/(1-\\gamma)][0,1/(1γ)]","key":"fyE4lnrSJt"},{"type":"text","value":" and then iterate the\nBellman operator:","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"qx9oV0foy6"}],"key":"yJAONbsPM8"},{"type":"math","value":"v^{(t+1)} = \\mathcal{J}^{\\pi}(v^{(t)}),","position":{"start":{"line":1201,"column":1},"end":{"line":1201,"column":1}},"html":"v(t+1)=Jπ(v(t)),v^{(t+1)} = \\mathcal{J}^{\\pi}(v^{(t)}),v(t+1)=Jπ(v(t)),","enumerator":"1.48","key":"lqLXYXJ6wA"},{"type":"paragraph","position":{"start":{"line":1203,"column":1},"end":{"line":1204,"column":1}},"children":[{"type":"text","value":"i.e. ","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"lGfRP9THtn"},{"type":"inlineMath","value":"v^{(t)} = (\\mathcal{J}^{\\pi})^{(t)} (v^{(0)})","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"html":"v(t)=(Jπ)(t)(v(0))v^{(t)} = (\\mathcal{J}^{\\pi})^{(t)} (v^{(0)})v(t)=(Jπ)(t)(v(0))","key":"Ro9Rt3TXlF"},{"type":"text","value":". Note that each iteration\ntakes ","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"ygZ57dScnJ"},{"type":"inlineMath","value":"O(|\\mathcal{S}|^2)","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"html":"O(S2)O(|\\mathcal{S}|^2)O(S2)","key":"qAglu9BtRi"},{"type":"text","value":" time for the matrix-vector multiplication.","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"FWlWpJqE7M"}],"key":"qRjOSnq0OT"}],"key":"wZ4oY61jSN"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def supremum_norm(v):\n return jnp.max(jnp.abs(v)) # same as jnp.linalg.norm(v, jnp.inf)\n\n\ndef loop_until_convergence(op, v, ε=1e-6):\n \"\"\"Repeatedly apply op to v until convergence (in supremum norm).\"\"\"\n while True:\n v_new = op(v)\n if supremum_norm(v_new - v) < ε:\n return v_new\n v = v_new\n\n\ndef iterative_evaluation(mdp: MDP, pi: Float[Array, \"S A\"], ε=1e-6) -> Float[Array, \" S\"]:\n op = partial(bellman_operator, mdp, pi)\n return loop_until_convergence(op, jnp.zeros(mdp.S), ε)","key":"NKZJJZD0RW"},{"type":"output","id":"Gar_yNIiFG5vOubSiOYqW","data":[],"key":"ichaQyjNeI"}],"data":{},"key":"u3YBTIoJ4K"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"children":[{"type":"text","value":"Then, as we showed in 
","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"key":"OZ6NyAA6nQ"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"y0XUKQY26K"},{"type":"text","value":"1.38","key":"D55SdQuieN"},{"type":"text","value":")","key":"A5RyjErlGE"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"fUIMYjsVCR"},{"type":"text","value":", by the Banach fixed-point theorem:","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"key":"XY8kXNH2SC"}],"key":"a63cdSY8hj"},{"type":"math","value":"\\|v^{(t)} - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v^{(0)} - V^\\pi\\|_{\\infty}.","position":{"start":{"line":1227,"column":1},"end":{"line":1227,"column":1}},"html":"v(t)Vπγtv(0)Vπ.\\|v^{(t)} - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v^{(0)} - V^\\pi\\|_{\\infty}.v(t)Vπγtv(0)Vπ.","enumerator":"1.49","key":"fLQkxhhO4P"}],"key":"kDGIX13zcR"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])","key":"nmnSTUeqF8"},{"type":"output","id":"3LoPYbIed8hZgY1CUcFqQ","data":[{"output_type":"execute_result","execution_count":21,"metadata":{},"data":{"text/plain":{"content":"Array([15.564166, 14.785956], dtype=float32)","content_type":"text/plain"}}}],"key":"uRhlqvxYXV"}],"data":{},"key":"m8oY93g6oS"},{"type":"block","children":[{"type":"proof","kind":"remark","label":"iterations_vi","identifier":"iterations_vi","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Convergence of iterative policy evaluation","position":{"start":{"line":1233,"column":1},"end":{"line":1233,"column":1}},"key":"dnO4q3xDGG"}],"key":"S0R7nQi8hO"},{"type":"paragraph","position":{"start":{"line":1236,"column":1},"end":{"line":1237,"column":1}},"children":[{"type":"text","value":"How many iterations do we need for an ","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"Mo4TvOb6p5"},{"type":"text","value":"ε","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"Nf6f0nSsIR"},{"type":"text","value":"-accurate estimate? 
We\ncan work backwards to solve for ","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"dCExuBr0ZT"},{"type":"inlineMath","value":"t","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"html":"ttt","key":"k4VOZB0qrg"},{"type":"text","value":":","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"jyisDQezf0"}],"key":"aDWVGVr5Sc"},{"type":"math","value":"\\begin{aligned}\n \\gamma^t \\|v^{(0)} - V^\\pi\\|_{\\infty} &\\le \\epsilon \\\\\n t &\\ge \\frac{\\log (\\epsilon / \\|v^{(0)} - V^\\pi\\|_{\\infty})}{\\log \\gamma} \\\\\n &= \\frac{\\log (\\|v^{(0)} - V^\\pi\\|_{\\infty} / \\epsilon)}{\\log (1 / \\gamma)},\n\\end{aligned}","position":{"start":{"line":1239,"column":1},"end":{"line":1245,"column":1}},"html":"γtv(0)Vπϵtlog(ϵ/v(0)Vπ)logγ=log(v(0)Vπ/ϵ)log(1/γ),\\begin{aligned}\n \\gamma^t \\|v^{(0)} - V^\\pi\\|_{\\infty} &\\le \\epsilon \\\\\n t &\\ge \\frac{\\log (\\epsilon / \\|v^{(0)} - V^\\pi\\|_{\\infty})}{\\log \\gamma} \\\\\n &= \\frac{\\log (\\|v^{(0)} - V^\\pi\\|_{\\infty} / \\epsilon)}{\\log (1 / \\gamma)},\n\\end{aligned}γtv(0)Vπtϵlogγlog(ϵ/∥v(0)Vπ)=log(1/γ)log(v(0)Vπ/ϵ),","enumerator":"1.50","key":"yG4D1PBCCM"},{"type":"paragraph","position":{"start":{"line":1247,"column":1},"end":{"line":1248,"column":1}},"children":[{"type":"text","value":"and so the number of iterations required for an\n","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"v2kK6UDuAy"},{"type":"text","value":"ε","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"IGltSXzbQX"},{"type":"text","value":"-accurate estimate is","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"MjaTIVjFi7"}],"key":"l2vLaW6jTG"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).","position":{"start":{"line":1250,"column":1},"end":{"line":1252,"column":1}},"html":"T=O(11γlog(1ϵ(1γ))).T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).T=O(1γ1log(ϵ(1γ)1)).","enumerator":"1.51","key":"qLHncLcO9y"},{"type":"paragraph","position":{"start":{"line":1254,"column":1},"end":{"line":1256,"column":1}},"children":[{"type":"text","value":"Note that we’ve applied the inequalities\n","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"Z5VbywXBV7"},{"type":"inlineMath","value":"\\|v^{(0)} - V^\\pi\\|_{\\infty} \\le 1/(1-\\gamma)","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"html":"v(0)Vπ1/(1γ)\\|v^{(0)} - V^\\pi\\|_{\\infty} \\le 1/(1-\\gamma)v(0)Vπ1/(1γ)","key":"iTYWsNs2By"},{"type":"text","value":" and\n","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"kJtU9012qT"},{"type":"inlineMath","value":"\\log (1/x) \\ge 1-x","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"html":"log(1/x)1x\\log (1/x) \\ge 1-xlog(1/x)1x","key":"JKyBkVpSqi"},{"type":"text","value":".","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"unaXZIqLsu"}],"key":"wXwD74NG18"}],"enumerator":"1.2","html_id":"iterations-vi","key":"xL3L5ixgSX"},{"type":"heading","depth":3,"position":{"start":{"line":1259,"column":1},"end":{"line":1259,"column":1}},"children":[{"type":"text","value":"Optimal policies in infinite-horizon 
MDPs","position":{"start":{"line":1259,"column":1},"end":{"line":1259,"column":1}},"key":"rR0tXQ37g0"}],"identifier":"optimal-policies-in-infinite-horizon-mdps","label":"Optimal policies in infinite-horizon MDPs","html_id":"optimal-policies-in-infinite-horizon-mdps","implicit":true,"enumerator":"1.5.3","key":"rmgLCS5WYv"},{"type":"paragraph","position":{"start":{"line":1261,"column":1},"end":{"line":1266,"column":1}},"children":[{"type":"text","value":"Now let’s move on to solving for an optimal policy in the\ninfinite-horizon case. As in ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"VDCJs4q9sj"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_policy_finite","label":"optimal_policy_finite","children":[{"type":"text","value":"the finite-horizon case","key":"Caa1wzMSFN"}],"template":"Definition %s","enumerator":"1.10","resolved":true,"html_id":"optimal-policy-finite","key":"FqVz0rpilf"},{"type":"text","value":", an ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"GASkshhiq5"},{"type":"strong","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"bfL9iuqmi2"}],"key":"aG3sdGIhqd"},{"type":"text","value":" ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"nFQmo0Xo2a"},{"type":"inlineMath","value":"\\pi^\\star","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"π\\pi^\\starπ","key":"BgRaFsz5DD"},{"type":"text","value":"\nis one that does at least as well as any other policy in all situations.\nThat is, for all policies ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"LSdU6fq1bc"},{"type":"text","value":"π","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"WNS6hlb4uU"},{"type":"text","value":", states ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"Ertv187Yd9"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"Mn3RtgWjaa"},{"type":"text","value":", times\n","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"XZVuZJ48Me"},{"type":"inlineMath","value":"\\hi \\in \\mathbb{N}","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"hN\\hi \\in \\mathbb{N}hN","key":"lZW4Kd0kcM"},{"type":"text","value":", and initial trajectories\n","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"g5Lv6ryLD9"},{"type":"inlineMath","value":"\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"τh=(s0,a0,r0,,sh)\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)τh=(s0,a0,r0,,sh)","key":"VB6Gb1r7OD"},{"type":"text","value":" where ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"ZUkDCetbEu"},{"type":"inlineMath","value":"s_\\hi = s","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"sh=ss_\\hi = ssh=s","key":"HEgKXIJXek"},{"type":"text","value":",","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"EgxBqAi5rf"}],"key":"oxmKFGrsNM"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^\\star}(s) &= \\E_{\\tau 
\\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid \\tau_\\hi]\n\\end{aligned}","label":"optimal_policy_infinite","identifier":"optimal_policy_infinite","html":"Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s]Eτρπ[rh+γrh+1+γ2rh+2+τh]\\begin{aligned}\n V^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid \\tau_\\hi]\n\\end{aligned}Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s]Eτρπ[rh+γrh+1+γ2rh+2+τh]","enumerator":"1.52","html_id":"optimal-policy-infinite","key":"ZMMBwnL3u4"},{"type":"paragraph","position":{"start":{"line":1278,"column":1},"end":{"line":1279,"column":1}},"children":[{"type":"text","value":"Once again, all optimal policies share the same ","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"iC7A7z18em"},{"type":"strong","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"DVSyvHSUIN"}],"key":"lZXKkRcqnk"},{"type":"text","value":" ","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"wEGTckKLdz"},{"type":"inlineMath","value":"V^\\star","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"html":"VV^\\starV","key":"fleKkBhDF2"},{"type":"text","value":", and the greedy policy with respect to this value function\nis optimal.","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"wKcnJL6GnI"}],"key":"OK8JYg66Od"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"zCxmgufPX7"}],"key":"bFTS0xUwCY"},{"type":"paragraph","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"children":[{"type":"text","value":"Verify this by modifying the proof ","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"key":"P0rGKhmmTW"},{"type":"crossReference","kind":"proof:theorem","identifier":"optimal_greedy","label":"optimal_greedy","children":[{"type":"text","value":"Theorem ","key":"pFVtpIHU4Z"},{"type":"text","value":"1.3","key":"w2zHu6wt69"}],"template":"Theorem %s","enumerator":"1.3","resolved":true,"html_id":"optimal-greedy","key":"VUFLdablgK"},{"type":"text","value":" from the finite-horizon case.","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"key":"cLzBZicUJp"}],"key":"dAFfXeaEZE"}],"key":"oyIAhR3Pib"},{"type":"paragraph","position":{"start":{"line":1285,"column":1},"end":{"line":1289,"column":1}},"children":[{"type":"text","value":"So how can we compute such an optimal policy? 
We can’t use the backwards\nDP approach from the finite-horizon case ","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"EBKxoHoXJI"},{"type":"crossReference","kind":"proof:definition","identifier":"pi_star_dp","label":"pi_star_dp","children":[{"type":"text","value":"Definition ","key":"nNoHz9sNOk"},{"type":"text","value":"1.11","key":"DHJIVewtfo"}],"template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","key":"pd97uDlMgY"},{"type":"text","value":" since there’s no “final timestep” to start\nfrom. Instead, we’ll exploit the fact that the Bellman consistency\nequation ","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"hHF5rKQZIT"},{"type":"crossReference","kind":"equation","identifier":"bellman_consistency_infinite","label":"bellman_consistency_infinite","children":[{"type":"text","value":"(","key":"m9sMi2O2qA"},{"type":"text","value":"1.32","key":"q9c1qIhml2"},{"type":"text","value":")","key":"PlFiLd1SIl"}],"template":"(%s)","enumerator":"1.32","resolved":true,"html_id":"bellman-consistency-infinite","key":"GxK7F4z60W"},{"type":"text","value":" for the optimal value\nfunction doesn’t depend on any policy:","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"Z6XpXSDdPO"}],"key":"PS9CPOySUs"},{"type":"math","value":"V^\\star(s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^\\star(s') \\right].","label":"bellman_optimality","identifier":"bellman_optimality","html":"V(s)=maxa[r(s,a)+γEsP(s,a)V(s)].V^\\star(s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^\\star(s') \\right].V(s)=amax[r(s,a)+γEsP(s,a)V(s)].","enumerator":"1.53","html_id":"bellman-optimality","key":"X6Kx5LBtxn"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"lVHkqTRyLS"}],"key":"ytDwpbuffU"},{"type":"paragraph","position":{"start":{"line":1298,"column":1},"end":{"line":1299,"column":1}},"children":[{"type":"text","value":"Verify this by substituting the greedy policy into the\nBellman consistency equation.","position":{"start":{"line":1298,"column":1},"end":{"line":1298,"column":1}},"key":"hnVrFyBAkU"}],"key":"BOCaVazTwM"}],"key":"puGpkqYJDA"},{"type":"paragraph","position":{"start":{"line":1302,"column":1},"end":{"line":1303,"column":1}},"children":[{"type":"text","value":"As before, thinking of the r.h.s. 
of ","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"lMiL6z6en5"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality","label":"bellman_optimality","children":[{"type":"text","value":"(","key":"caHl5JVNni"},{"type":"text","value":"1.53","key":"H30Vg4vmPE"},{"type":"text","value":")","key":"QCu04tAbls"}],"template":"(%s)","enumerator":"1.53","resolved":true,"html_id":"bellman-optimality","key":"fpbTwTQj0N"},{"type":"text","value":" as an operator on value functions\ngives the ","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"UqN3JPePVu"},{"type":"strong","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"children":[{"type":"text","value":"Bellman optimality operator","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"yIdOAUGIsH"}],"key":"VEvqBmfsYM"}],"key":"FA9eNzf0tE"},{"type":"math","value":"[\\mathcal{J}^{\\star}(v)](s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s') \\right]","label":"bellman_optimality_operator","identifier":"bellman_optimality_operator","html":"[J(v)](s)=maxa[r(s,a)+γEsP(s,a)v(s)][\\mathcal{J}^{\\star}(v)](s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s') \\right][J(v)](s)=amax[r(s,a)+γEsP(s,a)v(s)]","enumerator":"1.54","html_id":"bellman-optimality-operator","key":"TRmPxm6HvD"}],"key":"PZsM7GvEvn"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_optimality_operator(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \" S\"]:\n return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)\n\n\ndef check_optimal(v: Float[Array, \" S\"], mdp: MDP):\n return jnp.allclose(v, bellman_optimality_operator(v, mdp))","key":"gLwFDv7NuW"},{"type":"output","id":"XzEoiYoOYIzM_jpa8U6CJ","data":[],"key":"QRinva7u61"}],"data":{},"key":"qUw0M5E0EA"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":1321,"column":1},"end":{"line":1321,"column":1}},"children":[{"type":"text","value":"Value iteration","position":{"start":{"line":1321,"column":1},"end":{"line":1321,"column":1}},"key":"L0lykyIuea"}],"label":"value_iteration","identifier":"value_iteration","html_id":"value-iteration","enumerator":"1.5.3.1","key":"jflBhhfkZB"},{"type":"paragraph","position":{"start":{"line":1323,"column":1},"end":{"line":1326,"column":1}},"children":[{"type":"text","value":"Since the optimal policy is still a policy, our result that the Bellman\noperator is a contracting map still holds, and so we can repeatedly\napply this operator to converge to the optimal value function! 
This\nalgorithm is known as ","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"key":"TTPR88kvW7"},{"type":"strong","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"key":"CwXLxhI0i2"}],"key":"y00bHfI3CZ"},{"type":"text","value":".","position":{"start":{"line":1323,"column":1},"end":{"line":1323,"column":1}},"key":"yUZIUHOANp"}],"key":"I620Wri0Zj"}],"key":"peErTZmTUC"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, \" S\"]:\n \"\"\"Iterate the Bellman optimality operator until convergence.\"\"\"\n op = partial(bellman_optimality_operator, mdp)\n return loop_until_convergence(op, jnp.zeros(mdp.S), ε)","key":"ca4r0nmy69"},{"type":"output","id":"L94hPD6FUYuixquPi7rzm","data":[],"key":"hGKWm6Lg9U"}],"data":{},"key":"q8DhcJfIMw"},{"type":"block","children":[],"key":"QQmdtX5SS3"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"value_iteration(tidy_mdp_inf)","key":"gpdp5PCMpq"},{"type":"output","id":"bYIrykIWhLK07flL-L3IV","data":[{"output_type":"execute_result","execution_count":24,"metadata":{},"data":{"text/plain":{"content":"Array([15.564166, 14.785956], dtype=float32)","content_type":"text/plain"}}}],"key":"ebgC4I5W0X"}],"data":{},"key":"xbtS56pw6p"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":1339,"column":1},"end":{"line":1342,"column":1}},"children":[{"type":"text","value":"Note that the runtime analysis for an ","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"Q0f9WyFwMJ"},{"type":"text","value":"ε","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"Ld6naUhhJG"},{"type":"text","value":"-optimal value function\nis exactly the same as ","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"v04spzYJ6Y"},{"type":"crossReference","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"children":[{"type":"text","value":"iterative policy evaluation","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"n34Ouk295F"}],"identifier":"iterative_pe","label":"iterative_pe","kind":"heading","template":"Section %s","enumerator":"1.5.2.2","resolved":true,"html_id":"iterative-pe","key":"oA6oZs9GbW"},{"type":"text","value":"! 
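To make this concrete, here is a small illustrative helper (the name `iterations_for_accuracy` is ours, not part of the chapter's code) that evaluates the bound from Remark 1.2, using the worst-case initial error ‖v⁽⁰⁾ − Vᵖⁱ‖∞ ≤ 1/(1−γ):

```python
import math

def iterations_for_accuracy(γ: float, ε: float) -> int:
    """Smallest t such that γ^t · (1 / (1 - γ)) ≤ ε, i.e. the bound from
    Remark 1.2 with the worst-case initial error 1 / (1 - γ)."""
    initial_error = 1 / (1 - γ)  # ‖v⁽⁰⁾ − V^π‖∞ is at most this
    return math.ceil(math.log(initial_error / ε) / math.log(1 / γ))

iterations_for_accuracy(γ=0.95, ε=1e-6)  # ≈ 328 iterations
```
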
This is because value iteration is simply\nthe special case of applying iterative policy evaluation to the\n","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"FcNOqDN7DT"},{"type":"emphasis","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"children":[{"type":"text","value":"optimal","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"EQNNhL0rVt"}],"key":"iejoQ70Ohw"},{"type":"text","value":" value function.","position":{"start":{"line":1339,"column":1},"end":{"line":1339,"column":1}},"key":"bqzrWm962n"}],"key":"j6mglYbioe"},{"type":"paragraph","position":{"start":{"line":1344,"column":1},"end":{"line":1346,"column":1}},"children":[{"type":"text","value":"As the final step of the algorithm, to return an actual policy\n","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"key":"e0oHqRBvdN"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"html":"π^\\hat \\piπ^","key":"qD4Dqsc02X"},{"type":"text","value":", we can simply act greedily with respect to the final iteration\n","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"key":"a0As6AZ0Yr"},{"type":"inlineMath","value":"v^{(T)}","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"html":"v(T)v^{(T)}v(T)","key":"nldFdnL7iE"},{"type":"text","value":" of our above algorithm:","position":{"start":{"line":1344,"column":1},"end":{"line":1344,"column":1}},"key":"Jq7KgGVVNx"}],"key":"XgHZIjMm7v"},{"type":"math","value":"\\hat \\pi(s) = \\arg\\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v^{(T)}(s') \\right].","position":{"start":{"line":1348,"column":1},"end":{"line":1348,"column":1}},"html":"π^(s)=argmaxa[r(s,a)+γEsP(s,a)v(T)(s)].\\hat \\pi(s) = \\arg\\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v^{(T)}(s') \\right].π^(s)=argamax[r(s,a)+γEsP(s,a)v(T)(s)].","enumerator":"1.55","key":"f5JUlylpID"},{"type":"paragraph","position":{"start":{"line":1350,"column":1},"end":{"line":1352,"column":1}},"children":[{"type":"text","value":"We must be careful, though: the value function of this greedy policy,\n","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"jVZFyz7wNw"},{"type":"inlineMath","value":"V^{\\hat \\pi}","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"html":"Vπ^V^{\\hat \\pi}Vπ^","key":"nS8Ika3IIe"},{"type":"text","value":", is ","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"HP4KiHS8Ic"},{"type":"emphasis","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"m3RbuNXhG2"}],"key":"wikRXaFjSp"},{"type":"text","value":" the same as ","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"qxrmnUllPf"},{"type":"inlineMath","value":"v^{(T)}","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"html":"v(T)v^{(T)}v(T)","key":"yoOoWgppDD"},{"type":"text","value":", which need not even be a\nwell-defined value function for some policy!","position":{"start":{"line":1350,"column":1},"end":{"line":1350,"column":1}},"key":"k5NIYOrnmR"}],"key":"WbLtzhjeZw"},{"type":"paragraph","position":{"start":{"line":1354,"column":1},"end":{"line":1358,"column":1}},"children":[{"type":"text","value":"The bound on the 
policy’s quality is actually quite loose: if\n","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"fgsNtbK1ow"},{"type":"inlineMath","value":"\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\epsilon","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"html":"v(T)Vϵ\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\epsilonv(T)Vϵ","key":"pkruFIJ5dl"},{"type":"text","value":", then the greedy policy\n","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"s8eADOwLz1"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"html":"π^\\hat \\piπ^","key":"VvXq0E8jAx"},{"type":"text","value":" satisfies\n","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"X0iTOQeVsx"},{"type":"inlineMath","value":"\\|V^{\\hat \\pi} - V^\\star\\|_{\\infty} \\le \\frac{2\\gamma}{1-\\gamma} \\epsilon","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"html":"Vπ^V2γ1γϵ\\|V^{\\hat \\pi} - V^\\star\\|_{\\infty} \\le \\frac{2\\gamma}{1-\\gamma} \\epsilonVπ^V1γ2γϵ","key":"VTddphxPJ8"},{"type":"text","value":",\nwhich might potentially be very large.","position":{"start":{"line":1354,"column":1},"end":{"line":1354,"column":1}},"key":"Z1SySxffX4"}],"key":"pYK53JkHQQ"},{"type":"proof","kind":"theorem","label":"greedy_worsen","identifier":"greedy_worsen","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Greedy policy value worsening","position":{"start":{"line":1360,"column":1},"end":{"line":1360,"column":1}},"key":"oyp20tWbOx"}],"key":"qQ2JMf2dUU"},{"type":"math","value":"\\|V^{\\hat \\pi} - V^\\star \\|_{\\infty} \\le \\frac{2 \\gamma}{1-\\gamma} \\|v - V^\\star\\|_{\\infty}","position":{"start":{"line":1363,"column":1},"end":{"line":1363,"column":1}},"html":"Vπ^V2γ1γvV\\|V^{\\hat \\pi} - V^\\star \\|_{\\infty} \\le \\frac{2 \\gamma}{1-\\gamma} \\|v - V^\\star\\|_{\\infty}Vπ^V1γ2γvV","enumerator":"1.56","key":"rjdHVy6WEI"},{"type":"paragraph","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"key":"tDlouxDsCq"},{"type":"inlineMath","value":"\\hat \\pi(s) = \\arg\\max_a q(s, a)","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"html":"π^(s)=argmaxaq(s,a)\\hat \\pi(s) = \\arg\\max_a q(s, a)π^(s)=argmaxaq(s,a)","key":"StQIBjYMJz"},{"type":"text","value":" is the greedy policy with respect to","position":{"start":{"line":1365,"column":1},"end":{"line":1365,"column":1}},"key":"pWM7EyZrYy"}],"key":"YP6f3iq3bz"},{"type":"math","value":"q(s, a) = r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s').","position":{"start":{"line":1367,"column":1},"end":{"line":1367,"column":1}},"html":"q(s,a)=r(s,a)+γEsP(s,a)v(s).q(s, a) = r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s').q(s,a)=r(s,a)+γEsP(s,a)v(s).","enumerator":"1.57","key":"bEDuyoctCc"}],"enumerator":"1.5","html_id":"greedy-worsen","key":"Hx4IIQHf8P"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof","position":{"start":{"line":1370,"column":1},"end":{"line":1370,"column":1}},"key":"ZnY9K8QQvL"}],"key":"JlDZ4U9Ynp"},{"type":"paragraph","position":{"start":{"line":1371,"column":1},"end":{"line":1371,"column":1}},"children":[{"type":"text","value":"We first 
have","position":{"start":{"line":1371,"column":1},"end":{"line":1371,"column":1}},"key":"SZTP5NPPg8"}],"key":"tlpjnfTKgy"},{"type":"math","value":"\\begin{aligned}\n V^{\\star}(s) - V^{\\hat \\pi}(s) &= Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\\\\\n &= [Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s))] + [Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))].\n\\end{aligned}","position":{"start":{"line":1373,"column":1},"end":{"line":1378,"column":1}},"html":"V(s)Vπ^(s)=Q(s,π(s))Qπ^(s,π^(s))=[Q(s,π(s))Q(s,π^(s))]+[Q(s,π^(s))Qπ^(s,π^(s))].\\begin{aligned}\n V^{\\star}(s) - V^{\\hat \\pi}(s) &= Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\\\\\n &= [Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s))] + [Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))].\n\\end{aligned}V(s)Vπ^(s)=Q(s,π(s))Qπ^(s,π^(s))=[Q(s,π(s))Q(s,π^(s))]+[Q(s,π^(s))Qπ^(s,π^(s))].","enumerator":"1.58","key":"WFyC3SvzOu"},{"type":"paragraph","position":{"start":{"line":1380,"column":1},"end":{"line":1380,"column":1}},"children":[{"type":"text","value":"Let’s bound these two quantities separately.","position":{"start":{"line":1380,"column":1},"end":{"line":1380,"column":1}},"key":"qoj8BovrbY"}],"key":"k7hTcAmWkS"},{"type":"paragraph","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"children":[{"type":"text","value":"For the first quantity, note that by the definition of ","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"key":"SwwiLXCSsS"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"html":"π^\\hat \\piπ^","key":"CJZ8HU47OQ"},{"type":"text","value":", we have","position":{"start":{"line":1382,"column":1},"end":{"line":1382,"column":1}},"key":"t5uPdVDqAM"}],"key":"jWxyjyN5tC"},{"type":"math","value":"q(s, \\hat \\pi(s)) \\ge q(s,\\pi^\\star(s)).","position":{"start":{"line":1384,"column":1},"end":{"line":1384,"column":1}},"html":"q(s,π^(s))q(s,π(s)).q(s, \\hat \\pi(s)) \\ge q(s,\\pi^\\star(s)).q(s,π^(s))q(s,π(s)).","enumerator":"1.59","key":"en03GnG2De"},{"type":"paragraph","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"children":[{"type":"text","value":"Let’s add ","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"key":"LV84V2iMPh"},{"type":"inlineMath","value":"q(s, \\hat \\pi(s)) - q(s,\\pi^\\star(s)) \\ge 0","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"html":"q(s,π^(s))q(s,π(s))0q(s, \\hat \\pi(s)) - q(s,\\pi^\\star(s)) \\ge 0q(s,π^(s))q(s,π(s))0","key":"LEV8dgXEmO"},{"type":"text","value":" to the first term to get","position":{"start":{"line":1386,"column":1},"end":{"line":1386,"column":1}},"key":"KGuwPKp43L"}],"key":"fSumpDMEwy"},{"type":"math","value":"\\begin{aligned}\n Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s)) &\\le [Q^{\\star}(s,\\pi^\\star(s))- q(s,\\pi^\\star(s))] + [q(s, \\hat \\pi(s)) - Q^{\\star}(s, \\hat \\pi(s))] \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{\\star}(s))} [ V^{\\star}(s') - v(s') ] + \\gamma \\E_{s' \\sim P(s, \\hat \\pi(s))} [ v(s') - V^{\\star}(s') ] \\\\\n &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty}.\n\\end{aligned}","position":{"start":{"line":1388,"column":1},"end":{"line":1394,"column":1}},"html":"Q(s,π(s))Q(s,π^(s))[Q(s,π(s))q(s,π(s))]+[q(s,π^(s))Q(s,π^(s))]=γEsP(s,π(s))[V(s)v(s)]+γEsP(s,π^(s))[v(s)V(s)]2γvV.\\begin{aligned}\n Q^{\\star}(s,\\pi^\\star(s)) - 
Q^{\\star}(s, \\hat \\pi(s)) &\\le [Q^{\\star}(s,\\pi^\\star(s))- q(s,\\pi^\\star(s))] + [q(s, \\hat \\pi(s)) - Q^{\\star}(s, \\hat \\pi(s))] \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{\\star}(s))} [ V^{\\star}(s') - v(s') ] + \\gamma \\E_{s' \\sim P(s, \\hat \\pi(s))} [ v(s') - V^{\\star}(s') ] \\\\\n &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty}.\n\\end{aligned}Q(s,π(s))Q(s,π^(s))[Q(s,π(s))q(s,π(s))]+[q(s,π^(s))Q(s,π^(s))]=γEsP(s,π(s))[V(s)v(s)]+γEsP(s,π^(s))[v(s)V(s)]2γvV.","enumerator":"1.60","key":"bk6Sif0WYr"},{"type":"paragraph","position":{"start":{"line":1397,"column":1},"end":{"line":1397,"column":1}},"children":[{"type":"text","value":"The second quantity is bounded by","position":{"start":{"line":1397,"column":1},"end":{"line":1397,"column":1}},"key":"WUiXsBm8qj"}],"key":"qJK6aQaCTe"},{"type":"math","value":"\\begin{aligned}\n Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\n &=\n \\gamma \\E_{s'\\sim P(s, \\hat \\pi(s))}\\left[ V^\\star(s') - V^{\\hat \\pi}(s') \\right] \\\\\n & \\leq \n \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty\n\\end{aligned}","position":{"start":{"line":1399,"column":1},"end":{"line":1407,"column":1}},"html":"Q(s,π^(s))Qπ^(s,π^(s))=γEsP(s,π^(s))[V(s)Vπ^(s)]γVVπ^\\begin{aligned}\n Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\n &=\n \\gamma \\E_{s'\\sim P(s, \\hat \\pi(s))}\\left[ V^\\star(s') - V^{\\hat \\pi}(s') \\right] \\\\\n & \\leq \n \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty\n\\end{aligned}Q(s,π^(s))Qπ^(s,π^(s))=γEsP(s,π^(s))[V(s)Vπ^(s)]γVVπ^","enumerator":"1.61","key":"PdOLUUZ0m6"},{"type":"paragraph","position":{"start":{"line":1409,"column":1},"end":{"line":1409,"column":1}},"children":[{"type":"text","value":"and thus","position":{"start":{"line":1409,"column":1},"end":{"line":1409,"column":1}},"key":"mopslW7LZW"}],"key":"Y32jcMOp0p"},{"type":"math","value":"\\begin{aligned}\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty} + \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty \\\\\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le \\frac{2 \\gamma \\|v - V^{\\star}\\|_{\\infty}}{1-\\gamma}.\n\\end{aligned}","position":{"start":{"line":1411,"column":1},"end":{"line":1416,"column":1}},"html":"VVπ^2γvV+γVVπ^VVπ^2γvV1γ.\\begin{aligned}\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty} + \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty \\\\\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le \\frac{2 \\gamma \\|v - V^{\\star}\\|_{\\infty}}{1-\\gamma}.\n\\end{aligned}VVπ^VVπ^2γvV+γVVπ^1γ2γvV.","enumerator":"1.62","key":"HSEuixOVup"}],"enumerator":"1.3","key":"PtUCuY01yY"},{"type":"paragraph","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"children":[{"type":"text","value":"So in order to compensate and achieve ","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"key":"B9oW1ZKgeo"},{"type":"inlineMath","value":"\\|V^{\\hat \\pi} - V^{\\star}\\| \\le \\epsilon","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"html":"Vπ^Vϵ\\|V^{\\hat \\pi} - V^{\\star}\\| \\le \\epsilonVπ^Vϵ","key":"mz6qwg6Tje"},{"type":"text","value":", we must have","position":{"start":{"line":1419,"column":1},"end":{"line":1419,"column":1}},"key":"WVQyd3MydZ"}],"key":"zAwiladqT5"},{"type":"math","value":"\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\frac{1-\\gamma}{2 \\gamma} 
\\epsilon.","position":{"start":{"line":1421,"column":1},"end":{"line":1421,"column":1}},"html":"v(T)V1γ2γϵ.\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\frac{1-\\gamma}{2 \\gamma} \\epsilon.v(T)V2γ1γϵ.","enumerator":"1.63","key":"BkRQoNx2Ro"},{"type":"paragraph","position":{"start":{"line":1423,"column":1},"end":{"line":1423,"column":1}},"children":[{"type":"text","value":"This means, using ","position":{"start":{"line":1423,"column":1},"end":{"line":1423,"column":1}},"key":"XuUefj8sS7"},{"type":"crossReference","kind":"proof:remark","identifier":"iterations_vi","label":"iterations_vi","children":[{"type":"text","value":"Remark ","key":"oRrOaBLVJ4"},{"type":"text","value":"1.2","key":"iC7oVV5Ju2"}],"template":"Remark %s","enumerator":"1.2","resolved":true,"html_id":"iterations-vi","key":"b39PD9TiN3"},{"type":"text","value":", we need to run value iteration for","position":{"start":{"line":1423,"column":1},"end":{"line":1423,"column":1}},"key":"FgCjVmlOJK"}],"key":"UBlMX35deV"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{\\gamma}{\\epsilon (1-\\gamma)^2}\\right) \\right)","position":{"start":{"line":1425,"column":1},"end":{"line":1425,"column":1}},"html":"T=O(11γlog(γϵ(1γ)2))T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{\\gamma}{\\epsilon (1-\\gamma)^2}\\right) \\right)T=O(1γ1log(ϵ(1γ)2γ))","enumerator":"1.64","key":"kwubnaPEot"},{"type":"paragraph","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"children":[{"type":"text","value":"iterations to achieve an ","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"key":"NQUK9GptfS"},{"type":"text","value":"ε","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"key":"v9n4v2uXfw"},{"type":"text","value":"-accurate estimate of the optimal value function.","position":{"start":{"line":1427,"column":1},"end":{"line":1427,"column":1}},"key":"VIO6KOCGtl"}],"key":"VHjir1olZY"},{"type":"heading","depth":4,"position":{"start":{"line":1431,"column":1},"end":{"line":1431,"column":1}},"children":[{"type":"text","value":"Policy iteration","position":{"start":{"line":1431,"column":1},"end":{"line":1431,"column":1}},"key":"QCT94pwo4q"}],"label":"policy_iteration","identifier":"policy_iteration","html_id":"policy-iteration","enumerator":"1.5.3.2","key":"n8JR61T4g4"},{"type":"paragraph","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"children":[{"type":"text","value":"Can we mitigate this “greedy worsening”? What if instead of approximating the optimal value function and then acting greedily by it at the very end, we iteratively improve the policy and value function ","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"teZreTggUJ"},{"type":"emphasis","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"children":[{"type":"text","value":"together","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"bHAYpeFbI9"}],"key":"BYvRFH2SGb"},{"type":"text","value":"? This is the idea behind ","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"N3PLkifGUd"},{"type":"strong","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"JsFPWbwWD4"}],"key":"gebl74jMzF"},{"type":"text","value":". 
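The `policy_iteration` cell below relies on a helper `v_to_greedy` introduced earlier in the chapter. For reference, a minimal sketch of what such a helper computes (our own stand-in, assuming the same `MDP` container and the notebook's existing imports) is:

```python
def v_to_greedy_sketch(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]:
    """One-hot deterministic policy that is greedy w.r.t. the Q-values induced by v."""
    q = mdp.r + mdp.γ * mdp.P @ v  # shape (S, A), as in (1.55)
    return jnp.eye(mdp.A)[jnp.argmax(q, axis=1)]  # one-hot rows
```
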
In each step, we simply set the policy to act greedily with respect to its own value function.","position":{"start":{"line":1433,"column":1},"end":{"line":1433,"column":1}},"key":"HBwFsQFNbd"}],"key":"Si3VRmFlON"}],"key":"TKDr3lz82m"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, \"S A\"]:\n \"\"\"Iteratively improve the policy and value function.\"\"\"\n def op(pi):\n return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi))\n π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A # uniform random policy\n return loop_until_convergence(op, π_init, ε)","key":"KxVVHWvqzn"},{"type":"output","id":"hv-N829sHK89aKw3irEK9","data":[],"key":"YvFwfJuK9t"}],"data":{},"key":"VwXXJACex6"},{"type":"block","children":[],"key":"JDFKoj5DBN"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"policy_iteration(tidy_mdp_inf)","key":"rfqdLc63iK"},{"type":"output","id":"Kn8nUTYNhhNsMZj_kgAWi","data":[{"output_type":"execute_result","execution_count":26,"metadata":{},"data":{"text/plain":{"content":"Array([[1., 0.],\n [0., 1.]], dtype=float32)","content_type":"text/plain"}}}],"key":"oHth0SLq2A"}],"data":{},"key":"UqPeQ4CsyY"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"children":[{"type":"text","value":"Although PI appears more complex than VI, we’ll use the same contraction property ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"zeDxThNdDs"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_contraction","label":"bellman_contraction","children":[{"type":"text","value":"Theorem ","key":"ZcfRSQmT97"},{"type":"text","value":"1.4","key":"onXizweskL"}],"template":"Theorem %s","enumerator":"1.4","resolved":true,"html_id":"bellman-contraction","key":"cRYldbIdA0"},{"type":"text","value":" to show convergence. 
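As a quick sanity check (illustrative only; this cell just reuses the functions defined above), the value of the policy returned by `policy_iteration` should agree with the output of `value_iteration` up to the convergence tolerance:

```python
π_pi = policy_iteration(tidy_mdp_inf)
v_pi = eval_deterministic_infinite(tidy_mdp_inf, π_pi)
assert jnp.allclose(v_pi, value_iteration(tidy_mdp_inf), atol=1e-4)
```
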
This will give us the same runtime bound as value iteration and iterative policy evaluation for an ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"wFIMoYgtK7"},{"type":"text","value":"ε","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"Jh0MVz0PpB"},{"type":"text","value":"-optimal value function ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"IJAJhHsCVm"},{"type":"crossReference","kind":"proof:remark","identifier":"iterations_vi","label":"iterations_vi","children":[{"type":"text","value":"Remark ","key":"JbOxAqOryA"},{"type":"text","value":"1.2","key":"Wlfk3Eqep1"}],"template":"Remark %s","enumerator":"1.2","resolved":true,"html_id":"iterations-vi","key":"E3wLw0J8cz"},{"type":"text","value":", although in practice, PI often converges much faster.","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"tO3PIS0NPq"}],"key":"sIuCzu9qvv"},{"type":"proof","kind":"theorem","label":"pi_iter_analysis","identifier":"pi_iter_analysis","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy Iteration runtime and convergence","position":{"start":{"line":1450,"column":1},"end":{"line":1450,"column":1}},"key":"LaQM5SHabT"}],"key":"S9x37Sq33Q"},{"type":"paragraph","position":{"start":{"line":1453,"column":1},"end":{"line":1454,"column":1}},"children":[{"type":"text","value":"We aim to show that the number of iterations required for an\n","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"Cughv7EYbr"},{"type":"text","value":"ε","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"zJX9xBcvs0"},{"type":"text","value":"-accurate estimate of the optimal value function is","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"B1Xys9Qxp4"}],"key":"sawXfSll3X"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).","position":{"start":{"line":1456,"column":1},"end":{"line":1456,"column":1}},"html":"T=O(11γlog(1ϵ(1γ))).T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).T=O(1γ1log(ϵ(1γ)1)).","enumerator":"1.65","key":"eKNZcwaqbc"},{"type":"paragraph","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"children":[{"type":"text","value":"This bound follows from the contraction property ","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"key":"IfpIJvOgP4"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"XRBWSkEhMm"},{"type":"text","value":"1.38","key":"LZYAZfElTp"},{"type":"text","value":")","key":"piAQflLbQW"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"ZYcmR8uvtr"},{"type":"text","value":":","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"key":"yH51fsPfi3"}],"key":"FWABO1E4t3"},{"type":"math","value":"\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.","position":{"start":{"line":1460,"column":1},"end":{"line":1460,"column":1}},"html":"Vπt+1VγVπtV.\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star 
\\|_{\\infty}.Vπt+1VγVπtV.","enumerator":"1.66","key":"CEUKvfxrE9"},{"type":"paragraph","position":{"start":{"line":1462,"column":1},"end":{"line":1463,"column":1}},"children":[{"type":"text","value":"We’ll prove that the iterates of PI respect the contraction property by\nshowing that the policies improve monotonically:","position":{"start":{"line":1462,"column":1},"end":{"line":1462,"column":1}},"key":"TrmLg5jqbg"}],"key":"lZIQhyOj0r"},{"type":"math","value":"V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s).","position":{"start":{"line":1465,"column":1},"end":{"line":1465,"column":1}},"html":"Vπt+1(s)Vπt(s).V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s).Vπt+1(s)Vπt(s).","enumerator":"1.67","key":"bn1MWev8xP"},{"type":"paragraph","position":{"start":{"line":1467,"column":1},"end":{"line":1468,"column":1}},"children":[{"type":"text","value":"Then we’ll use this to show\n","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"key":"BZnClAPhMY"},{"type":"inlineMath","value":"V^{\\pi^{t+1}}(s) \\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"html":"Vπt+1(s)[J(Vπt)](s)V^{\\pi^{t+1}}(s) \\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)Vπt+1(s)[J(Vπt)](s)","key":"ak7z9AaZVX"},{"type":"text","value":". Note that","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"key":"jBcCo1MEg6"}],"key":"me8FJeDANj"},{"type":"math","value":"\\begin{aligned}\n[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) &= \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^{\\pi^{t}}(s') \\right] \\\\\n &= r(s, \\pi^{t+1}(s)) + \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} V^{\\pi^{t}}(s')\n\\end{aligned}","position":{"start":{"line":1470,"column":1},"end":{"line":1475,"column":1}},"html":"[J(Vπt)](s)=maxa[r(s,a)+γEsP(s,a)Vπt(s)]=r(s,πt+1(s))+γEsP(s,πt+1(s))Vπt(s)\\begin{aligned}\n[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) &= \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^{\\pi^{t}}(s') \\right] \\\\\n &= r(s, \\pi^{t+1}(s)) + \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} V^{\\pi^{t}}(s')\n\\end{aligned}[J(Vπt)](s)=amax[r(s,a)+γEsP(s,a)Vπt(s)]=r(s,πt+1(s))+γEsP(s,πt+1(s))Vπt(s)","enumerator":"1.68","key":"sK6m26dt6u"},{"type":"paragraph","position":{"start":{"line":1477,"column":1},"end":{"line":1478,"column":1}},"children":[{"type":"text","value":"Since\n","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"key":"F289oALmn2"},{"type":"inlineMath","value":"[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) \\ge V^{\\pi^{t}}(s)","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"html":"[J(Vπt)](s)Vπt(s)[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) \\ge V^{\\pi^{t}}(s)[J(Vπt)](s)Vπt(s)","key":"rbeXx3vrpW"},{"type":"text","value":", we then have","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"key":"TkCuSEAFpu"}],"key":"B0CpGP7HL7"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') 
\\right].\n\\end{aligned}Vπt+1(s)Vπt(s)Vπt+1(s)J(Vπt)(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)].","enumerator":"1.69","html_id":"pi-iter-proof","key":"fERdIxiUdN"},{"type":"paragraph","position":{"start":{"line":1489,"column":1},"end":{"line":1492,"column":1}},"children":[{"type":"text","value":"But note that the\nexpression being averaged is the same as the expression on the l.h.s.\nwith ","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"J2M2es2L6Y"},{"type":"inlineMath","value":"s","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"html":"sss","key":"tkszrBvlQK"},{"type":"text","value":" replaced by ","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"hMrMlHXjEo"},{"type":"inlineMath","value":"s'","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"html":"ss's","key":"YoGIVtkHNT"},{"type":"text","value":". So we can apply the same inequality\nrecursively to get","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"flC4JQIktV"}],"key":"sVHa1NYscY"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge \\gamma^2 \\E_{\\substack{s' \\sim P(s, \\pi^{t+1}(s)) \\\\ s'' \\sim P(s', \\pi^{t+1}(s'))}} \\left[V^{\\pi^{t+1}}(s'') - V^{\\pi^{t}}(s'') \\right]\\\\\n &\\ge \\cdots\n\\end{aligned}","position":{"start":{"line":1494,"column":1},"end":{"line":1500,"column":1}},"html":"Vπt+1(s)Vπt(s)γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]γ2EsP(s,πt+1(s))sP(s,πt+1(s))[Vπt+1(s)Vπt(s)]\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge \\gamma^2 \\E_{\\substack{s' \\sim P(s, \\pi^{t+1}(s)) \\\\ s'' \\sim P(s', \\pi^{t+1}(s'))}} \\left[V^{\\pi^{t+1}}(s'') - V^{\\pi^{t}}(s'') \\right]\\\\\n &\\ge \\cdots\n\\end{aligned}Vπt+1(s)Vπt(s)γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]γ2EsP(s,πt+1(s))s′′P(s,πt+1(s))[Vπt+1(s′′)Vπt(s′′)]","enumerator":"1.70","key":"K6QVSK8zo2"},{"type":"paragraph","position":{"start":{"line":1502,"column":1},"end":{"line":1506,"column":1}},"children":[{"type":"text","value":"which implies that ","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"puU6Do8ql2"},{"type":"inlineMath","value":"V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s)","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"html":"Vπt+1(s)Vπt(s)V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s)Vπt+1(s)Vπt(s)","key":"SOAonUhf4d"},{"type":"text","value":"\nfor all ","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"MDjZmBMwvq"},{"type":"inlineMath","value":"s","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"html":"sss","key":"qHS1wmomcC"},{"type":"text","value":" (since the r.h.s. converges to zero). 
We can then plug this\nback into\n","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"nsFK4uFE9m"},{"type":"crossReference","kind":"equation","identifier":"pi_iter_proof","label":"pi_iter_proof","children":[{"type":"text","value":"(","key":"DhTBChoudr"},{"type":"text","value":"1.69","key":"oirvDhVDfx"},{"type":"text","value":")","key":"ng1aL1iSM7"}],"template":"(%s)","enumerator":"1.69","resolved":true,"html_id":"pi-iter-proof","key":"Ur1h7eAow2"},{"type":"text","value":"\nto get the desired result:","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"syzvlYyaQt"}],"key":"eOp0j3Xq4Y"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge 0 \\\\\n V^{\\pi^{t+1}}(s) &\\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)\n\\end{aligned}","position":{"start":{"line":1508,"column":1},"end":{"line":1514,"column":1}},"html":"Vπt+1(s)J(Vπt)(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]0Vπt+1(s)[J(Vπt)](s)\\begin{aligned}\n V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge 0 \\\\\n V^{\\pi^{t+1}}(s) &\\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)\n\\end{aligned}Vπt+1(s)J(Vπt)(s)Vπt+1(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]0[J(Vπt)](s)","enumerator":"1.71","key":"e98qyJowdR"},{"type":"paragraph","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"children":[{"type":"text","value":"This means we can now apply the Bellman convergence result ","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"key":"anZZJu8j2s"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"CWntIUgTDP"},{"type":"text","value":"1.38","key":"PjYH5Av1Qp"},{"type":"text","value":")","key":"YA0qwsM6fz"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"zPGYQ4qfo8"},{"type":"text","value":" to get","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"key":"RExDt3YPFj"}],"key":"OMGWCAwCTf"},{"type":"math","value":"\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\|\\mathcal{J}^{\\star} (V^{\\pi^{t}}) - V^{\\star}\\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.","position":{"start":{"line":1518,"column":1},"end":{"line":1518,"column":1}},"html":"Vπt+1VJ(Vπt)VγVπtV.\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\|\\mathcal{J}^{\\star} (V^{\\pi^{t}}) - V^{\\star}\\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star 
\\|_{\\infty}.Vπt+1VJ(Vπt)VγVπtV.","enumerator":"1.72","key":"fjCWosrDG7"}],"enumerator":"1.6","html_id":"pi-iter-analysis","key":"acupTtfGOa"},{"type":"heading","depth":2,"position":{"start":{"line":1521,"column":1},"end":{"line":1521,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":1521,"column":1},"end":{"line":1521,"column":1}},"key":"HiTgvMqiAW"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"1.6","key":"iIhPyeDf8a"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":1523,"column":1},"end":{"line":1555,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":1523,"column":1},"end":{"line":1530,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1523,"column":1},"end":{"line":1529,"column":1}},"children":[{"type":"text","value":"Markov decision processes (MDPs) are a framework for sequential\ndecision making under uncertainty. They consist of a state space\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"oMHQURtos4"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"S\\mathcal{S}S","key":"K6D2D2yWr8"},{"type":"text","value":", an action space ","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"Ur4uyeXJ6A"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"A\\mathcal{A}A","key":"RqDww6vffx"},{"type":"text","value":", an initial state distribution\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"dR7hrukRJl"},{"type":"inlineMath","value":"\\mu \\in \\Delta(\\mathcal{S})","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"μΔ(S)\\mu \\in \\Delta(\\mathcal{S})μΔ(S)","key":"YfcwVWN0uk"},{"type":"text","value":", a transition function ","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"MpvO0pQLsN"},{"type":"inlineMath","value":"P(s' \\mid s, a)","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"P(ss,a)P(s' \\mid s, a)P(ss,a)","key":"TWsDcF5Tlf"},{"type":"text","value":", and a\nreward function ","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"yQipNMaRNL"},{"type":"inlineMath","value":"r(s, a)","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"r(s,a)r(s, a)r(s,a)","key":"mlmxRLTPny"},{"type":"text","value":". 
They can be finite-horizon (ends after\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"AQgeIqICxL"},{"type":"inlineMath","value":"H","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"HHH","key":"AJWfOY71DZ"},{"type":"text","value":" timesteps) or infinite-horizon (where rewards scale by\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"q4cTlgdYl7"},{"type":"inlineMath","value":"\\gamma \\in (0, 1)","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"γ(0,1)\\gamma \\in (0, 1)γ(0,1)","key":"NuOyFGETIg"},{"type":"text","value":" at each timestep).","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"RYvU7Rw6YO"}],"key":"YmvrMmNBHx"}],"key":"k1lyU0beFv"},{"type":"listItem","spread":true,"position":{"start":{"line":1531,"column":1},"end":{"line":1535,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1531,"column":1},"end":{"line":1534,"column":1}},"children":[{"type":"text","value":"Our goal is to find a policy ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"Ls6kcJ2L5V"},{"type":"text","value":"π","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"bRbMVPdY69"},{"type":"text","value":" that maximizes expected total\nreward. Policies can be ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"c09rKbnihM"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"W5OMVi8sig"}],"key":"JkgoqZ3ulE"},{"type":"text","value":" or ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"VS1bSwR9gy"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"baMBhIJAql"}],"key":"v6Q9mUwb3y"},{"type":"text","value":",\n","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"msuxV2RcZE"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"state-dependent","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"SdN6afErfm"}],"key":"IU5zc5YNbt"},{"type":"text","value":" or ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"rEbqkEtl3P"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"history-dependent","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"iysAeJAK4J"}],"key":"tZr8T20NlD"},{"type":"text","value":", ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"HXJv7Tx9R1"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"stationary","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"vNA5sJck3k"}],"key":"eXawNTE1E3"},{"type":"text","value":" 
or\n","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"v6TTipqWsq"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"time-dependent","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"aoBiMc7bkc"}],"key":"Cz7JGeVD1P"},{"type":"text","value":".","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"AF4AyUpkwW"}],"key":"fEph84nQ2e"}],"key":"HHOrQ0yl1c"},{"type":"listItem","spread":true,"position":{"start":{"line":1536,"column":1},"end":{"line":1537,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"children":[{"type":"text","value":"A policy induces a distribution over ","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"key":"JpaaEq3OID"},{"type":"strong","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"children":[{"type":"text","value":"trajectories","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"key":"gR08ouaboC"}],"key":"zPcUn6Z58o"},{"type":"text","value":".","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"key":"XsVtcFdKpo"}],"key":"nj9SudS8pR"}],"key":"JoztHIwldh"},{"type":"listItem","spread":true,"position":{"start":{"line":1538,"column":1},"end":{"line":1545,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1538,"column":1},"end":{"line":1544,"column":1}},"children":[{"type":"text","value":"We can evaluate a policy by computing its ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"xMG1XM5Ono"},{"type":"strong","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"J9g0nnxyVI"}],"key":"MPs9ODGCwz"},{"type":"text","value":"\n","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"sZarxyDxFH"},{"type":"inlineMath","value":"V^\\pi(s)","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"Vπ(s)V^\\pi(s)Vπ(s)","key":"JOwfQ5qAE9"},{"type":"text","value":", which is the expected total reward starting from state\n","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"QYdRvAaUaF"},{"type":"inlineMath","value":"s","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"sss","key":"A5BFl72mRE"},{"type":"text","value":" and following policy ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"c4RjMVz6Dl"},{"type":"text","value":"π","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"VdO62r1ebP"},{"type":"text","value":". 
We can also compute the\n","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"xwQtKE89B2"},{"type":"strong","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"children":[{"type":"text","value":"state-action value function","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"I49DeFE84p"}],"key":"tDrPyap2Lg"},{"type":"text","value":" ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"JkIX0K4GUu"},{"type":"inlineMath","value":"Q^\\pi(s, a)","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"Qπ(s,a)Q^\\pi(s, a)Qπ(s,a)","key":"YwU3g8waBn"},{"type":"text","value":", which is the expected\ntotal reward starting from state ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"hgXe91Cooo"},{"type":"inlineMath","value":"s","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"sss","key":"GRdZsASl40"},{"type":"text","value":", taking action ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"kILCbxBiim"},{"type":"inlineMath","value":"a","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"aaa","key":"LB5SJH02ot"},{"type":"text","value":", and then\nfollowing policy ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"R8UPl5laT2"},{"type":"text","value":"π","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"ILFxuKCjJY"},{"type":"text","value":". In the finite-horizon setting, these also\ndepend on the timestep ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"AHOQIYYLFk"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"h\\hih","key":"ffyyJOt3mT"},{"type":"text","value":".","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"zZOD7ctGL3"}],"key":"SGHRyAbLyd"}],"key":"HpBjcOhkJ4"},{"type":"listItem","spread":true,"position":{"start":{"line":1546,"column":1},"end":{"line":1550,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1546,"column":1},"end":{"line":1549,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"YzrNnGWJyJ"},{"type":"strong","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"children":[{"type":"text","value":"Bellman consistency equation","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"EO2xqAbAWk"}],"key":"xQWbgJPq7p"},{"type":"text","value":" is an equation that the value\nfunction must satisfy. It can be used to solve for the value\nfunctions exactly. Thinking of the r.h.s. 
of this equation as an\noperator on value functions gives the ","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"qSmwAohlXA"},{"type":"strong","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"meAJPM3LHP"}],"key":"CZh5Xvb1HM"},{"type":"text","value":".","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"geknOuuN4e"}],"key":"YpozwVdoR8"}],"key":"kyHMUyV98Y"},{"type":"listItem","spread":true,"position":{"start":{"line":1551,"column":1},"end":{"line":1553,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1551,"column":1},"end":{"line":1552,"column":1}},"children":[{"type":"text","value":"In the finite-horizon setting, we can compute the optimal policy\nusing ","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"key":"MRcV7UTBfR"},{"type":"strong","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"key":"eSt4yxA7wW"}],"key":"n5tagC3fXA"},{"type":"text","value":".","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"key":"d9pNTpagwD"}],"key":"DigLmPe7US"}],"key":"zkTXywXLzP"},{"type":"listItem","spread":true,"position":{"start":{"line":1554,"column":1},"end":{"line":1555,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1554,"column":1},"end":{"line":1555,"column":1}},"children":[{"type":"text","value":"In the infinite-horizon setting, we can compute the optimal policy\nusing ","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"fF7GBGcqOE"},{"type":"strong","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"SoEwz7EniC"}],"key":"YlGoh3ArFZ"},{"type":"text","value":" or ","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"fz1ycg2Xgk"},{"type":"strong","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"ofABOzPcnZ"}],"key":"dJUsVn1q4q"},{"type":"text","value":".","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"Up67IfFoYS"}],"key":"TgEjI88dWn"}],"key":"hPPDvIzksL"}],"key":"vAsPbcukBl"}],"key":"yx4u6IzIhO"}],"key":"CaDQXSmzwH"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"CS/STAT 184: Introduction to Reinforcement Learning","url":"/","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"2 Linear Quadratic Regulators","url":"/control","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/myst.xref.json b/myst.xref.json index 6659245..1612e5b 100644 --- a/myst.xref.json +++ b/myst.xref.json @@ -1 +1 @@ 
-{"version":"1","myst":"1.3.7","references":[{"kind":"page","data":"/index.json","url":"/"},{"identifier":"prerequisites","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"reinforcement-learning-in-a-nutshell","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"core-tasks-of-reinforcement-learning","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"course-overview","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"notation","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"programming","kind":"heading","data":"/index.json","url":"/"},{"kind":"page","data":"/mdps.json","url":"/mdps"},{"identifier":"introduction","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"markov","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"definition","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"finite_horizon_mdp","html_id":"finite-horizon-mdp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_mdp","html_id":"tidy-mdp","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"policy","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_policy","html_id":"tidy-policy","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectories","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectory","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_traj","html_id":"tidy-traj","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"autoregressive_trajectories","html_id":"autoregressive-trajectories","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"value-functions","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"action_value","html_id":"action-value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"relating-the-value-function-and-action-value-function","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"greedy-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-one-step-bellman-consistency-equation","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency","html_id":"bellman-consistency","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_action","html_id":"bellman-consistency-action","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_det","html_id":"bellman-det","kind":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"the-one-step-bellman-operator","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_operator","html_id":"bellman-operator","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"eval_dp","html_id":"eval-dp","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identi
fier":"tidy_eval_finite","html_id":"tidy-eval-finite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"opt_dynamic_programming","html_id":"opt-dynamic-programming","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_policy_finite","html_id":"optimal-policy-finite","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_greedy","html_id":"optimal-greedy","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_optimal","html_id":"bellman-consistency-optimal","kind":"proof:corollary","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_star_dp","html_id":"pi-star-dp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"discounted-rewards","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"stationary-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value-functions-and-bellman-consistency","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency_infinite","html_id":"bellman-consistency-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"solving-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-bellman-operator-is-a-contraction-mapping","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"contraction","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"contraction_convergence","html_id":"contraction-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_convergence","html_id":"bellman-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_contraction","html_id":"bellman-contraction","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy-evaluation-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"matrix-inversion-for-deterministic-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"tidy_tabular","html_id":"tidy-tabular","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"matrix_inversion_pe","html_id":"matrix-inversion-pe","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_eval_infinite","html_id":"tidy-eval-infinite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"iterative_pe","html_id":"iterative-pe","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"iterations_vi","html_id":"iterations-vi","kind":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal-policies-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"optimal_policy_infinite","html_id":"optimal-policy-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality","html_id":"bellman-optimality","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality_operator","html_id":"bellman-optimality-operator","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"value_iteration","html_id":"value-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"greedy_worsen","html_id":"greedy-worsen
","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy_iteration","html_id":"policy-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_analysis","html_id":"pi-iter-analysis","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_proof","html_id":"pi-iter-proof","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"summary","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"kind":"page","data":"/control.json","url":"/control"},{"identifier":"introduction","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"control_examples","html_id":"control-examples","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"robot_hand","html_id":"robot-hand","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"cart_pole","html_id":"cart-pole","kind":"proof:example","data":"/control.json","url":"/control"},{"identifier":"optimal-control","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"optimal_control","html_id":"optimal-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"a-first-attempt-discretization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"lqr_definition","html_id":"lqr-definition","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"value_lqr","html_id":"value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_lqr","html_id":"optimal-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr","html_id":"optimal-value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr_quadratic","html_id":"optimal-value-lqr-quadratic","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"optimal_policy_lqr_linear","html_id":"optimal-policy-lqr-linear","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"lemma_pi_linear","html_id":"lemma-pi-linear","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"k_pi","html_id":"k-pi","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"riccati","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"lemma_schur","html_id":"lemma-schur","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"expected-state-at-time-hi","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"expected_state","html_id":"expected-state","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"extensions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"time_dep_lqr","html_id":"time-dep-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"time_dependent_lqr","html_id":"time-dependent-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"riccati_time_dependent","html_id":"riccati-time-dependent","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"more-general-quadratic-cost-functions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"general_quadratic_cost","html_id":"general-quadratic-cost","kind":"equa
tion","data":"/control.json","url":"/control"},{"identifier":"tracking-a-predefined-trajectory","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"approx_nonlinear","html_id":"approx-nonlinear","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"nonlinear_control","html_id":"nonlinear-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"local-linearization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"finite-differencing","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local-convexification","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local_linearization","html_id":"local-linearization","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"iterative_lqr","html_id":"iterative-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"ilqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"summary","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"kind":"page","data":"/bandits.json","url":"/bandits"},{"identifier":"introduction","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"advertising","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"clinical_trials","html_id":"clinical-trials","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"multi-armed","kind":"proof:remark","data":"/bandits.json","url":"/bandits"},{"identifier":"regret","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-exploration-random-guessing","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_exploration","html_id":"pure-exploration","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-code","html_id":"pure-exploration-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-output","html_id":"pure-exploration-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_greedy","html_id":"pure-greedy","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-code","html_id":"pure-greedy-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-output","html_id":"pure-greedy-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"etc","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"etc-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"exploration-phase","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"exploitation-phase","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"hoeffding","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"hoeffding-etc","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"union_bound","html_id":"union-bound","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"epsilon-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"ucb","kind":"heading","data":"/bandits
.json","url":"/bandits"},{"identifier":"ucb-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"lower-bound-on-regret-intuition","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"thompson_sampling","html_id":"thompson-sampling","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"bayesian_bernoulli","html_id":"bayesian-bernoulli","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"contextual-bandits","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"contextual_bandit","html_id":"contextual-bandit","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"lin_ucb","html_id":"lin-ucb","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"ols_bandit","html_id":"ols-bandit","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"chebyshev","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"summary","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"kind":"page","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"introduction","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"parameterized_empirical_risk_minimization","html_id":"parameterized-empirical-risk-minimization","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"gd_def","html_id":"gd-def","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"linear-regression","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"neural-networks","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"kind":"page","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"introduction","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"erm","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"conditional_expectation_minimizes_mse","html_id":"conditional-expectation-minimizes-mse","kind":"proof:theorem","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"empirical_risk_minimization","html_id":"empirical-risk-minimization","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted-value-iteration","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"fitted_q_iteration","html_id":"fitted-q-iteration","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted_evaluation","html_id":"fitted-evaluation","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"summary","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"kind":"page","data":"/pg.json","url":"/pg"},{"identifier":"introduction","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"policy-stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"objective_fn","html_id":"objective-fn","kind":"equation","data":
"/pg.json","url":"/pg"},{"identifier":"parameterizations","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"tabular-representation","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"linear-in-features","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"neural-policies","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"continuous-action-spaces","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"importance_sampling","html_id":"importance-sampling","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"the-reinforce-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"trajectory_likelihood","html_id":"trajectory-likelihood","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"reinforce_pg","html_id":"reinforce-pg","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_with_q","html_id":"pg-with-q","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"baselines-and-advantages","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"eq:pg_baseline","html_id":"eq-pg-baseline","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_advantage","html_id":"pg-advantage","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_baseline","html_id":"pg-baseline","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"pdl","kind":"proof:theorem","data":"/pg.json","url":"/pg"},{"identifier":"pdl_eq","html_id":"pdl-eq","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"trust-region-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"kld","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"trpo","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"npg_optimization","html_id":"npg-optimization","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"fisher_matrix","html_id":"fisher-matrix","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"fisher_trajectory","html_id":"fisher-trajectory","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"npg","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural_simple","html_id":"natural-simple","kind":"proof:example","data":"/pg.json","url":"/pg"},{"identifier":"proximal-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"summary","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"kind":"page","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"introduction","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"behavioral-cloning","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"behavioral_cloning","html_id":"behavioral-cloning","kind":"proof:definition","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"distribution-shift","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"dataset-aggregation-dagger","kind":"heading
","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"kind":"page","data":"/planning.json","url":"/planning"},{"identifier":"introduction","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"deterministic-zero-sum-fully-observable-two-player-games","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"notation","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"min-max-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"min-max-value","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"alpha-beta-search","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"alpha-beta-example","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"monte-carlo-tree-search","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-algorithm","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"value-approximation","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-policy-value","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree-policy","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"self-play","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-self-play","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"references","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"kind":"page","data":"/exploration.json","url":"/exploration"},{"identifier":"introduction","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"per_episode_regret","html_id":"per-episode-regret","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"sparse-reward","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"sparse_reward_mdp","html_id":"sparse-reward-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"exploration-in-deterministic-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"explore_then_exploit","html_id":"explore-then-exploit","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"explore_then_exploit_performance","html_id":"explore-then-exploit-performance","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_mab","html_id":"mdp-mab","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_as_mab","html_id":"mdp-as-mab","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"ineffective_mdp","html_id":"ineffective-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"modelling-the-transitions","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"reward-bonus","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"eq:ucb_vi_bonus","html_id":"
eq-ucb-vi-bonus","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb_vi_bonus","html_id":"ucb-vi-bonus","kind":"proof:remark","data":"/exploration.json","url":"/exploration"},{"identifier":"err","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"definition","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb-vi-alg","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"performance-of-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb_vi_regret","html_id":"ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"linear-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"linear_mdp","html_id":"linear-mdp","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"planning-in-a-linear-mdp","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi","html_id":"lin-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"performance","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi_regret","html_id":"lin-ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"summary","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"kind":"page","data":"/background.json","url":"/background"},{"identifier":"o-notation","kind":"heading","data":"/background.json","url":"/background","implicit":true},{"identifier":"python","kind":"heading","data":"/background.json","url":"/background","implicit":true}]} \ No newline at end of file 
+{"version":"1","myst":"1.3.7","references":[{"kind":"page","data":"/index.json","url":"/"},{"identifier":"prerequisites","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"reinforcement-learning-in-a-nutshell","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"core-tasks-of-reinforcement-learning","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"course-overview","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"notation","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"programming","kind":"heading","data":"/index.json","url":"/"},{"kind":"page","data":"/mdps.json","url":"/mdps"},{"identifier":"introduction","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"markov","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"definition","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"finite_horizon_mdp","html_id":"finite-horizon-mdp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_mdp","html_id":"tidy-mdp","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"policy","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_policy","html_id":"tidy-policy","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectories","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectory","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_traj","html_id":"tidy-traj","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"autoregressive_trajectories","html_id":"autoregressive-trajectories","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"value-functions","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"action_value","html_id":"action-value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"relating-the-value-function-and-action-value-function","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"greedy-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-one-step-bellman-consistency-equation","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency","html_id":"bellman-consistency","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_action","html_id":"bellman-consistency-action","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_det","html_id":"bellman-det","kind":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"the-one-step-bellman-operator","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_operator","html_id":"bellman-operator","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"eval_dp","html_id":"eval-dp","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identi
fier":"tidy_eval_finite","html_id":"tidy-eval-finite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"opt_dynamic_programming","html_id":"opt-dynamic-programming","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_policy_finite","html_id":"optimal-policy-finite","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_greedy","html_id":"optimal-greedy","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_optimal","html_id":"bellman-consistency-optimal","kind":"proof:corollary","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_star_dp","html_id":"pi-star-dp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"discounted-rewards","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"stationary-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value-functions-and-bellman-consistency","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency_infinite","html_id":"bellman-consistency-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"solving-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-bellman-operator-is-a-contraction-mapping","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"contraction","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"contraction_convergence","html_id":"contraction-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_convergence","html_id":"bellman-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_contraction","html_id":"bellman-contraction","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy-evaluation-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"matrix-inversion-for-deterministic-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"tidy_tabular","html_id":"tidy-tabular","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"matrix_inversion_pe","html_id":"matrix-inversion-pe","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_eval_infinite","html_id":"tidy-eval-infinite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"iterative_pe","html_id":"iterative-pe","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"iterations_vi","html_id":"iterations-vi","kind":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal-policies-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"optimal_policy_infinite","html_id":"optimal-policy-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality","html_id":"bellman-optimality","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality_operator","html_id":"bellman-optimality-operator","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"value_iteration","html_id":"value-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"greedy_worsen","html_id":"greedy-worsen
","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy_iteration","html_id":"policy-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_analysis","html_id":"pi-iter-analysis","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_proof","html_id":"pi-iter-proof","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"summary","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"kind":"page","data":"/control.json","url":"/control"},{"identifier":"introduction","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"control_examples","html_id":"control-examples","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"robot_hand","html_id":"robot-hand","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"cart_pole","html_id":"cart-pole","kind":"proof:example","data":"/control.json","url":"/control"},{"identifier":"optimal-control","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"optimal_control","html_id":"optimal-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"a-first-attempt-discretization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"lqr_definition","html_id":"lqr-definition","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"value_lqr","html_id":"value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_lqr","html_id":"optimal-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr","html_id":"optimal-value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr_quadratic","html_id":"optimal-value-lqr-quadratic","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"optimal_policy_lqr_linear","html_id":"optimal-policy-lqr-linear","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"lemma_pi_linear","html_id":"lemma-pi-linear","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"k_pi","html_id":"k-pi","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"riccati","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"lemma_schur","html_id":"lemma-schur","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"expected-state-at-time-hi","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"expected_state","html_id":"expected-state","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"extensions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"time_dep_lqr","html_id":"time-dep-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"time_dependent_lqr","html_id":"time-dependent-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"riccati_time_dependent","html_id":"riccati-time-dependent","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"more-general-quadratic-cost-functions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"general_quadratic_cost","html_id":"general-quadratic-cost","kind":"equa
tion","data":"/control.json","url":"/control"},{"identifier":"tracking-a-predefined-trajectory","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"approx_nonlinear","html_id":"approx-nonlinear","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"nonlinear_control","html_id":"nonlinear-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"local-linearization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"finite-differencing","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local-convexification","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local_linearization","html_id":"local-linearization","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"iterative_lqr","html_id":"iterative-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"ilqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"summary","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"kind":"page","data":"/bandits.json","url":"/bandits"},{"identifier":"introduction","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"advertising","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"clinical_trials","html_id":"clinical-trials","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"multi-armed","kind":"proof:remark","data":"/bandits.json","url":"/bandits"},{"identifier":"regret","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-exploration-random-guessing","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_exploration","html_id":"pure-exploration","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-code","html_id":"pure-exploration-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-output","html_id":"pure-exploration-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_greedy","html_id":"pure-greedy","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-code","html_id":"pure-greedy-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-output","html_id":"pure-greedy-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"etc","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"etc-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"exploration-phase","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"exploitation-phase","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"hoeffding","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"hoeffding-etc","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"union_bound","html_id":"union-bound","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"epsilon-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"ucb","kind":"heading","data":"/bandits
.json","url":"/bandits"},{"identifier":"ucb-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"lower-bound-on-regret-intuition","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"thompson_sampling","html_id":"thompson-sampling","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"bayesian_bernoulli","html_id":"bayesian-bernoulli","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"contextual-bandits","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"contextual_bandit","html_id":"contextual-bandit","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"lin_ucb","html_id":"lin-ucb","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"ols_bandit","html_id":"ols-bandit","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"chebyshev","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"summary","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"kind":"page","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"introduction","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"parameterized_empirical_risk_minimization","html_id":"parameterized-empirical-risk-minimization","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"gd_def","html_id":"gd-def","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"linear-regression","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"neural-networks","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"kind":"page","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"introduction","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"erm","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"conditional_expectation_minimizes_mse","html_id":"conditional-expectation-minimizes-mse","kind":"proof:theorem","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"empirical_risk_minimization","html_id":"empirical-risk-minimization","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted-value-iteration","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"fitted_q_iteration","html_id":"fitted-q-iteration","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted_evaluation","html_id":"fitted-evaluation","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"summary","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"kind":"page","data":"/pg.json","url":"/pg"},{"identifier":"introduction","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"policy-stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"objective_fn","html_id":"objective-fn","kind":"equation","data":
"/pg.json","url":"/pg"},{"identifier":"parameterizations","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"tabular-representation","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"linear-in-features","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"neural-policies","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"continuous-action-spaces","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"importance_sampling","html_id":"importance-sampling","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"the-reinforce-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"trajectory_likelihood","html_id":"trajectory-likelihood","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"reinforce_pg","html_id":"reinforce-pg","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_with_q","html_id":"pg-with-q","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"baselines-and-advantages","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"eq:pg_baseline","html_id":"eq-pg-baseline","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_advantage","html_id":"pg-advantage","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_baseline","html_id":"pg-baseline","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"pdl","kind":"proof:theorem","data":"/pg.json","url":"/pg"},{"identifier":"pdl_eq","html_id":"pdl-eq","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"trust-region-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"kld","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"trpo","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"npg_optimization","html_id":"npg-optimization","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"fisher_matrix","html_id":"fisher-matrix","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"fisher_trajectory","html_id":"fisher-trajectory","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"npg","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural_simple","html_id":"natural-simple","kind":"proof:example","data":"/pg.json","url":"/pg"},{"identifier":"proximal-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"summary","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"kind":"page","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"introduction","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"behavioral-cloning","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"behavioral_cloning","html_id":"behavioral-cloning","kind":"proof:definition","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"distribution-shift","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"dataset-aggregation-dagger","kind":"heading
","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"kind":"page","data":"/planning.json","url":"/planning"},{"identifier":"introduction","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"deterministic-zero-sum-fully-observable-two-player-games","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"notation","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"min-max-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"min-max-value","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"min-max-example","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"complexity-of-min-max-search","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"alpha-beta-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"alpha-beta-example","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"monte-carlo-tree-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"mcts-algorithm","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"incorporating-value-functions-and-policies","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-policy-value","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree-policy","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"self-play","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-self-play","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"summary","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"references","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"kind":"page","data":"/exploration.json","url":"/exploration"},{"identifier":"introduction","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"per_episode_regret","html_id":"per-episode-regret","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"sparse-reward","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"sparse_reward_mdp","html_id":"sparse-reward-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"exploration-in-deterministic-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"explore_then_exploit","html_id":"explore-then-exploit","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"explore_then_exploit_performance","html_id":"explore-then-exploit-performance","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_mab","html_id":"mdp-mab","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_as_mab","html_id":"mdp-as-mab","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"ineffective_mdp","html_id":"ineffective-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb-vi","kind":"heading","data":"/exploration.json","url":
"/exploration","implicit":true},{"identifier":"modelling-the-transitions","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"reward-bonus","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"eq:ucb_vi_bonus","html_id":"eq-ucb-vi-bonus","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb_vi_bonus","html_id":"ucb-vi-bonus","kind":"proof:remark","data":"/exploration.json","url":"/exploration"},{"identifier":"err","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"definition","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb-vi-alg","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"performance-of-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb_vi_regret","html_id":"ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"linear-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"linear_mdp","html_id":"linear-mdp","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"planning-in-a-linear-mdp","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi","html_id":"lin-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"performance","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi_regret","html_id":"lin-ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"summary","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"kind":"page","data":"/background.json","url":"/background"},{"identifier":"o-notation","kind":"heading","data":"/background.json","url":"/background","implicit":true},{"identifier":"python","kind":"heading","data":"/background.json","url":"/background","implicit":true}]} \ No newline at end of file diff --git a/objects.inv b/objects.inv index 17b9975..b813ea1 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/pg.html b/pg.html index 11c3b17..928e4d7 100644 --- a/pg.html +++ b/pg.html @@ -1,4 +1,4 @@ -6 Policy Optimization - CS/STAT 184: Introduction to Reinforcement Learning

6 Policy Gradient Methods

6.1 Introduction

The core task of RL is finding the optimal policy in a given environment. This is essentially an optimization problem: out of some space of policies, we want to find the one that achieves the maximum total reward (in expectation).

It’s typically intractable to compute the optimal policy exactly. Instead, policy optimization algorithms start from some randomly initialized policy and then improve it step by step. We’ve already seen some examples of these, namely Section 1.5.3.2 for finite MDPs and Section 2.6.4 in continuous control. In particular, we often use policies that can be described by some finite set of parameters. For such parameterized policies, we can approximate the policy gradient: the gradient of the expected total reward with respect to the parameters. This tells us the direction in which the parameters should be updated to achieve a higher total reward (in expectation). Policy gradient methods are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models, many of which use policies parameterized as deep neural networks.

  1. We begin the chapter with a short review of gradient ascent, a general optimization method.
  2. We’ll then see how to estimate the policy gradient, enabling us to apply (stochastic) gradient ascent in the RL setting.
  3. Then we’ll explore some proximal optimization techniques that ensure the steps taken are “not too large”. This helps stabilize training and is widely used in practice.

from utils import plt, Array, Callable, jax, jnp

6.2 Gradient Ascent

Gradient ascent is a general optimization algorithm for any differentiable function. A suitable analogy for this algorithm is hiking up a mountain, where you keep taking steps in the steepest direction upwards. Here, your vertical position y is the function being optimized, and your horizontal position (x, z) is the input to the function. The slope of the mountain at your current position is given by the gradient, written \nabla y(x, z) \in \mathbb{R}^2.

def f(x, y):
    """Himmelblau's function"""
    return (x**2 + y - 11)**2 + (x + y**2 - 7)**2

# Create a grid of points
x = jnp.linspace(-5, 5, 400)
y = jnp.linspace(-5, 5, 400)
X, Y = jnp.meshgrid(x, y)
Z = f(X, Y)

# Create the plot
fig, ax = plt.subplots(figsize=(6, 6))

# Plot the function using imshow
img = ax.imshow(Z, extent=[-5, 5, -5, 5], origin='lower')

# Add color bar
fig.colorbar(img, ax=ax)

# Gradient computation using JAX
tx, ty = 1.0, 1.0
gx, gy = jax.grad(f, argnums=(0, 1))(tx, ty)

# Scatter point
ax.scatter(tx, ty, color='red', s=100)

# Add arrow representing the gradient
ax.arrow(tx, ty, gx * 0.01, gy * 0.01, head_width=0.3, head_length=0.3, fc='blue', ec='blue')

# Add plot title
ax.set_title("Himmelblau's Function")

plt.show()

<Figure size 600x600 with 2 Axes>

For differentiable functions, this can be thought of as the vector of partial derivatives,

\nabla y(x, z) = \begin{pmatrix}
\frac{\partial y}{\partial x} \\
\frac{\partial y}{\partial z}
\end{pmatrix}.

To calculate the slope (aka “directional derivative”) of the mountain in a given direction (\Delta x, \Delta z), you take the dot product of the difference vector with the gradient. This means that the direction with the highest slope is exactly the gradient itself, so we can describe the gradient ascent algorithm as follows:

Definition 6.1 (Gradient ascent)

\begin{pmatrix}
x^{k+1} \\ z^{k+1}
\end{pmatrix}
=
\begin{pmatrix}
x^{k} \\ z^{k}
\end{pmatrix}
+
\eta \nabla y(x^{k}, z^{k})

where k denotes the iteration of the algorithm and \eta > 0 is a “step size” hyperparameter that controls the size of the steps we take. (Note that we could also vary the step size across iterations, that is, \eta^0, \dots, \eta^K.)
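In code, one iteration of this update is a single line. Here is a minimal sketch (the concave objective and the step size below are arbitrary choices for illustration):

import jax
import jax.numpy as jnp

def y(params):
    x, z = params
    # a simple concave "mountain" peaked at (x, z) = (1, 2)
    return -((x - 1.0) ** 2 + (z - 2.0) ** 2)

η = 0.1  # step size
params = jnp.zeros(2)
for k in range(100):
    params = params + η * jax.grad(y)(params)
# params is now close to the peak at (1, 2)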

The case of a two-dimensional input is easy to visualize. But this idea can be straightforwardly extended to higher-dimensional inputs.

From now on, we’ll use J to denote the function we’re trying to maximize, and θ to denote the parameters being optimized over. (In the above example, \theta = \begin{pmatrix} x & z \end{pmatrix}^\top.)

Notice that our parameters will stop changing once \nabla J(\theta) = 0. Once we reach this stationary point, our current parameters are ‘locally optimal’ in some sense; it’s impossible to increase the function by moving in any direction. If J is convex, then the only point where this happens is at the global optimum. Otherwise, if J is nonconvex, the best we can hope for is a local optimum.

How does a computer compute the gradient of a function?

One way is symbolic differentiation, which is similar to the way you might compute it by hand: the computer applies a list of rules to transform the symbols involved. Python’s sympy package supports symbolic differentiation. However, functions implemented in code may not always have a straightforward symbolic representation.
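For instance, a quick check with sympy:

import sympy as sp

x = sp.symbols("x")
sp.diff(sp.sin(x) * x**2, x)  # x**2*cos(x) + 2*x*sin(x)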

Another way is numerical differentiation, which is based on the limit definition of a (directional) derivative:

\nabla_{\boldsymbol{u}} J(\boldsymbol{x}) = \lim_{\varepsilon \to 0}
\frac{J(\boldsymbol{x} + \varepsilon \boldsymbol{u}) - J(\boldsymbol{x})}{\varepsilon}

Then, we can substitute a small value of \varepsilon on the r.h.s. to approximate the directional derivative. How small, though? If we need an accurate estimate, we may need such a small value of \varepsilon that typical computers will run into rounding errors. Also, to compute the full gradient, we would need to compute the r.h.s. once for each input dimension. This is an issue if computing J is expensive.

Automatic differentiation achieves the best of both worlds. Like symbolic differentiation, we manually implement the derivative rules for a few basic operations. However, instead of executing these on the symbols, we execute them on the values when the function gets called, like in numerical differentiation. This allows us to differentiate through programming constructs such as branches or loops, and doesn’t involve any arbitrarily small values.
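As a small illustration of the difference (the test function and \varepsilon below are arbitrary):

import jax
import jax.numpy as jnp

def J(x):
    return jnp.sin(x) * x**2

x0, eps = 1.5, 1e-4

# numerical differentiation: substitute a small ε into the limit definition
numerical = (J(x0 + eps) - J(x0)) / eps

# automatic differentiation: derivative rules applied to values at call time
automatic = jax.grad(J)(x0)

# the two agree up to O(ε) truncation and floating-point rounding error
print(numerical, automatic)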

6.2.1 Stochastic gradient ascent

In real applications, computing the gradient of the target function is not so simple. As an example from supervised learning, J(\theta) might be the sum of squared prediction errors across an entire training dataset. However, if our dataset is very large, it might not fit into our computer’s memory! In these cases, we often compute some estimate of the gradient at each step, \tilde \nabla J(\theta), and walk in that direction instead. This is called stochastic gradient ascent. In the SL example above, we might randomly choose a minibatch of samples and use them to estimate the true prediction error. (This approach is known as minibatch SGD.)
def sgd(
     θ_init: Array,
     estimate_gradient: Callable[[Array], Array],
     η: float,
@@ -130,8 +130,8 @@
     θ = θ_init
     for step in range(n_steps):
         θ += η * estimate_gradient(θ)
-    return θ

What makes one gradient estimator better than another? -Ideally, we want this estimator to be unbiased; that is, on average, it matches a single true gradient step:

E[~J(θ)]=J(θ).\E [\tilde \nabla J(\theta)] = \nabla J(\theta).

We also want the variance of the estimator to be low so that its performance doesn’t change drastically at each step.

We can actually show that, for many “nice” functions, in a finite number of steps, SGD will find a θ that is “close” to a stationary point. + return θ

What makes one gradient estimator better than another? +Ideally, we want this estimator to be unbiased; that is, on average, it matches a single true gradient step:

E[~J(θ)]=J(θ).\E [\tilde \nabla J(\theta)] = \nabla J(\theta).

We also want the variance of the estimator to be low so that its performance doesn’t change drastically at each step.

We can actually show that, for many “nice” functions, in a finite number of steps, SGD will find a θ that is “close” to a stationary point. In another perspective, for such functions, the local “landscape” of JJ around θ becomes flatter and flatter the longer we run SGD.

We’ll now see a concrete application of gradient ascent in the context of policy optimization.

6.3Policy (stochastic) gradient ascent

Remember that in RL, the primary goal is to find the optimal policy that achieves the maximimum total reward, which we can express using the value function we defined in Definition 1.6:

J(π):=Es0μ0Vπ(s0)=Eh=0H1rhwheres0μ0st+1P(sh,ah),ah=π(sh)rh=r(sh,ah).\begin{aligned} +and the norm of the gradient estimator has a bounded second moment σ2,\sigma^2,

J(θK)2O(Mβσ2/K).\|\nabla J(\theta^K)\|^2 \le O \left( M \beta \sigma^2 / K\right).

We call a function β-smooth if its gradient is Lipschitz continuous with constant β:

J(θ)J(θ)βθθ.\|\nabla J(\theta) - \nabla J(\theta')\| \le \beta \|\theta - \theta'\|.

We’ll now see a concrete application of gradient ascent in the context of policy optimization.

6.3Policy (stochastic) gradient ascent

Remember that in RL, the primary goal is to find the optimal policy that achieves the maximimum total reward, which we can express using the value function we defined in Definition 1.6:

\begin{aligned}
J(\pi) := \E_{s_0 \sim \mu_0} V^{\pi} (s_0) = & \E \sum_{h=0}^{H-1} r_h \\
\text{where} \quad & s_0 \sim \mu_0 \\
& s_{h+1} \sim P(s_h, a_h), \\
& a_h = \pi(s_h) \\
& r_h = r(s_h, a_h).
\end{aligned}

(Note that we’ll continue to work in the undiscounted, finite-horizon case. Analogous results hold for the discounted, infinite-horizon case.)

As shown by the notation, this is exactly the function J that we want to maximize using gradient ascent. What does θ correspond to, though? In general, π is a function, and optimizing over the space of arbitrary input-output mappings would be intractable. Instead, we need to describe π in terms of some finite set of parameters θ.

6.3.1 Example policy parameterizations

What are some ways we could parameterize our policy?

6.3.1.1 Tabular representation

If both the state and action spaces are finite, perhaps we could simply learn a preference value \theta_{s,a} for each state-action pair. Then to turn this into a valid distribution, we perform a softmax operation: we exponentiate each of them, and then normalize to form a valid distribution:

\pi^\text{softmax}_\theta(a | s) = \frac{\exp(\theta_{s,a})}{\sum_{a'} \exp (\theta_{s,a'})}.
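In code, this might look like the following sketch (assuming θ is stored as a |S| × |A| array of preferences):

import jax.numpy as jnp

def tabular_softmax_policy(θ: jnp.ndarray, s: int) -> jnp.ndarray:
    """Return the action distribution π_θ(· | s) for a finite MDP."""
    logits = θ[s]                       # preferences θ[s, a] for each action a
    z = jnp.exp(logits - logits.max())  # subtract the max for numerical stability
    return z / z.sum()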

However, this doesn’t make use of any structure in the states or actions, so while this is flexible, it is also prone to overfitting.

6.3.1.2 Linear in features

Another approach is to map each state-action pair into some feature space \phi(s, a) \in \mathbb{R}^p. Then, to map a feature vector to a probability, we take a linear combination of the features and take a softmax:

\pi^\text{linear in features}_{\theta}(a|s) = \frac{\exp(\theta^\top \phi(s, a))}{\sum_{a'} \exp(\theta^\top \phi(s, a'))}.

Another interpretation is that θ represents the feature vector of the “desired” state-action pair, as state-action pairs whose features align closely with θ are given higher probability.

The score function for this parameterization is also quite elegant:

\begin{aligned}
\nabla \log \pi_\theta(a|s) &= \nabla \left( \theta^\top \phi(s, a) - \log \left( \sum_{a'} \exp(\theta^\top \phi(s, a')) \right) \right) \\
&= \phi(s, a) - \E_{a' \sim \pi_\theta(s)} \phi(s, a')
\end{aligned}
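The score function also translates directly into code (a sketch; φ is an assumed feature map returning a vector in \mathbb{R}^p, and actions is the finite action set):

import jax.numpy as jnp

def linear_softmax_score(θ, φ, s, a, actions):
    """Compute ∇_θ log π_θ(a|s) = φ(s, a) − E_{a' ∼ π_θ(s)} φ(s, a')."""
    feats = jnp.stack([φ(s, b) for b in actions])  # shape (|A|, p)
    logits = feats @ θ
    probs = jnp.exp(logits - logits.max())
    probs = probs / probs.sum()                    # π_θ(· | s)
    return φ(s, a) - probs @ feats                 # observed minus expected features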

Plugging this into our policy gradient expression, we get

\begin{aligned}
\nabla J(\theta) & = \E_{\tau \sim \rho_\theta} \left[
\sum_{h=0}^{H-1} \nabla \log \pi_\theta(a_h | s_h) A_h^{\pi_\theta}
\right] \\
& = \E_{\tau \sim \rho_\theta} \left[
\sum_{h=0}^{H-1} \left( \phi(s_h, a_h) - \E_{a' \sim \pi(s_h)} \phi(s_h, a') \right) A_h^{\pi_\theta}(s_h, a_h)
\right] \\
& = \E_{\tau \sim \rho_\theta} \left[ \sum_{h=0}^{H-1} \phi(s_h, a_h) A_h^{\pi_\theta} (s_h, a_h) \right]
\end{aligned}

Why can we drop the \E \phi(s_h, a') term? By linearity of expectation, consider the dropped term at a single timestep: \E_{\tau \sim \rho_\theta} \left[ \left( \E_{a' \sim \pi(s_h)} \phi(s_h, a') \right) A_h^{\pi_\theta}(s_h, a_h) \right]. By Adam’s Law, we can wrap the advantage term in a conditional expectation on the state s_h. Then we already know that \E_{a \sim \pi(s)} A_h^{\pi}(s, a) = 0, and so this entire term vanishes.

6.3.1.3 Neural policies

More generally, we could map states and actions to unnormalized scores via some parameterized function f_\theta : \mathcal{S} \times \mathcal{A} \to \mathbb{R}, such as a neural network, and choose actions according to a softmax:

\pi^\text{general}_\theta(a|s) = \frac{\exp(f_{\theta}(s,a))}{\sum_{a'} \exp(f_{\theta}(s,a'))}.

The score can then be written as

\nabla \log \pi_\theta(a|s) = \nabla f_\theta(s, a) - \E_{a' \sim \pi_\theta(s)} \nabla f_\theta (s, a')

6.3.2 Continuous action spaces

Consider a continuous n-dimensional action space \mathcal{A} = \mathbb{R}^n. Then for a stochastic policy, we could use a function to predict the mean action and then add some random noise about it. For example, we could use a neural network to predict the mean action \mu_\theta(s) and then add some noise \epsilon \sim \mathcal{N}(0, \sigma^2 I) to it:

\pi_\theta(a|s) = \mathcal{N}(\mu_\theta(s), \sigma^2 I).
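Sampling from this policy is straightforward (a sketch; μ_θ stands in for any parameterized mean function, with a fixed σ):

import jax

def sample_action(key, μ_θ, s, σ=0.1):
    """Sample a ∼ N(μ_θ(s), σ² I) by perturbing the predicted mean action."""
    mean = μ_θ(s)
    return mean + σ * jax.random.normal(key, mean.shape)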

Now that we have seen parameterized policies, we can write the total reward in terms of the parameters:

J(\theta) = \E_{\tau \sim \rho_\theta} R(\tau).

Now how do we maximize this function (the expected total reward) over the parameters? One simple idea would be to directly apply gradient ascent:

\theta^{k+1} = \theta^k + \eta \nabla J(\theta^k).

In order to apply this technique, we need to be able to evaluate the gradient \nabla J(\theta). But J(\theta) is very difficult, or even intractable, to compute exactly, since it involves taking an expectation over all possible trajectories \tau. Can we rewrite it in a form that’s more convenient to implement?

6.3.3 Importance Sampling

There is a general trick called importance sampling for evaluating such expectations. Suppose we want to estimate \E_{x \sim p}[f(x)] where p is hard or expensive to sample from. We can, however, evaluate the likelihood p(x). Suppose that we can sample from a different distribution q. Since an expectation is just a weighted average, we can sample x from q, compute f(x), and then reweight the results: if a given x is common under p but rare under q, we should boost its weighting, and if it is common under q but uncommon under p, we should lower its weighting. The reweighting factor is exactly the likelihood ratio between the target distribution p and the sampling distribution q:

\E_{x \sim p}[f(x)] = \sum_{x \in \mathcal{X}} f(x) p(x) = \sum_{x \in \mathcal{X}} f(x) \frac{p(x)}{q(x)} q(x) = \E_{x \sim q} \left[ \frac{p(x)}{q(x)} f(x) \right].

Doesn’t this seem too good to be true? If there were no drawbacks, we could use this to estimate any expectation of any function on any arbitrary distribution! The drawback is that the variance may be very large due to the likelihood ratio term. If there are values of x that are very rare in the sampling distribution q, but common under p, then the likelihood ratio p(x)/q(x) will cause the variance to blow up.
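We can verify the identity numerically on a small discrete example (the distributions below are arbitrary):

import jax.numpy as jnp

xs = jnp.arange(4.0)
p = jnp.array([0.1, 0.2, 0.3, 0.4])      # target distribution
q = jnp.array([0.25, 0.25, 0.25, 0.25])  # sampling distribution
f = xs ** 2

direct = jnp.sum(f * p)                  # E_{x∼p}[f(x)]
reweighted = jnp.sum((p / q) * f * q)    # E_{x∼q}[(p/q) · f(x)]
assert jnp.allclose(direct, reweighted)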

6.4 The REINFORCE policy gradient

Returning to RL, suppose there is some trajectory distribution ρ(τ) that is easy to sample from, such as a database of existing trajectories. We can then rewrite \nabla J(\theta), a.k.a. the policy gradient, as follows. All gradients are being taken with respect to θ.

\begin{aligned}
\nabla J(\theta) & = \nabla \E_{\tau \sim \rho_\theta} [ R(\tau) ] \\
& = \nabla \E_{\tau \sim \rho} \left[ \frac{\rho_\theta(\tau)}{\rho(\tau)} R(\tau) \right] & & \text{likelihood ratio trick} \\
& = \E_{\tau \sim \rho} \left[ \frac{\nabla \rho_\theta(\tau)}{\rho(\tau)} R(\tau) \right] & & \text{switching gradient and expectation}
\end{aligned}

Note that for ρ = ρ_θ, the inside term becomes

\nabla J(\theta) = \E_{\tau \sim \rho_\theta} [ \nabla \log \rho_\theta(\tau) \cdot R(\tau)].

(The order of operations is \nabla (\log \rho_\theta)(\tau).)

Note that when the state transitions are Markov (i.e. s_h only depends on s_{h-1}, a_{h-1}) and the policy is time-homogeneous (i.e. a_h \sim \pi_\theta(s_h)), we can write out the likelihood of a trajectory under the policy \pi_\theta:

\begin{aligned}
\rho_\theta(\tau) &= \mu(s_0) \pi_\theta(a_0 | s_0) \\
&\qquad \times P(s_1 | s_0, a_0) \pi_\theta(a_1 | s_1) \\
&\qquad \times \cdots \\
&\qquad \times P(s_{H-1} | s_{H-2}, a_{H-2}) \pi_\theta(a_{H-1} | s_{H-1}).
\end{aligned}

Taking the logarithm turns this product into a sum, and the transition probabilities do not depend on θ, so only the policy terms survive the gradient:

\nabla \log \rho_\theta(\tau) = \sum_{h=0}^{H-1} \nabla \log \pi_\theta(a_h | s_h).

Substituting into the expression above gives

\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[ \sum_{h=0}^{H-1} \nabla \log \pi_\theta(a_h | s_h) \cdot R(\tau) \right].

This expression allows us to estimate the gradient by sampling a few trajectories from \pi_\theta, calculating the likelihoods of the chosen actions, and substituting these into the expression above. We can then use this gradient estimate to apply stochastic gradient ascent.

def estimate_gradient_reinforce_pseudocode(env, π, θ):
    τ = sample_trajectory(env, π(θ))
    gradient_hat = 0
    for s, a, r in τ:
        # score ∇_θ log π_θ(a|s), weighted by the total reward (pseudocode:
        # sample_trajectory and the return R(τ) are assumed helpers)
        gradient_hat += jax.grad(lambda θ_: jnp.log(π(θ_)(s, a)))(θ) * R(τ)
    return gradient_hat

In fact, we can simplify this estimator further. Intuitively, the action taken at a given timestep cannot affect the rewards received at earlier timesteps. One can show that the corresponding terms vanish in expectation, and that we only need to consider the present and future rewards to calculate the policy gradient:

\begin{aligned}
\nabla J(\theta) &= \E_{\tau \sim \rho_\theta} \left[ \sum_{h=0}^{H-1} \nabla_\theta \log \pi_{\theta}(a_h | s_h) \sum_{h' = h}^{H-1} r(s_{h'}, a_{h'}) \right] \\
&= \E_{\tau \sim \rho_\theta} \left[ \sum_{h=0}^{H-1} \nabla_\theta \log \pi_{\theta}(a_h | s_h) Q^{\pi_\theta}(s_{h}, a_{h}) \right]
\end{aligned}

Exercise: Prove that this is equivalent to the previous definitions. What modification to the expression must be made for the discounted, infinite-horizon setting?

For some intuition into how this method works, recall that we update our parameters according to

\begin{aligned}
\theta^{k+1} &= \theta^k + \eta \nabla J(\theta^k) \\
&= \theta^k + \eta \E_{\tau \sim \rho_{\theta^k}} [\nabla \log \rho_{\theta^k}(\tau) \cdot R(\tau)].
\end{aligned}

Consider the “good” trajectories where R(τ) is large. Then θ gets updated so that these trajectories become more likely. To see why, recall that ρ_θ(τ) is the likelihood of the trajectory τ under the policy π_θ, so the gradient points in the direction that makes τ more likely.

6.5 Baselines and advantages

A central idea from supervised learning is the bias-variance decomposition, which shows that the mean squared error of an estimator is the sum of its squared bias and its variance. The REINFORCE gradient estimator (6.21) is already unbiased, meaning that its expectation over trajectories is the true policy gradient. Can we find ways to reduce its variance as well?

One common way is to subtract a baseline function b_h : \mathcal{S} \to \mathbb{R} at each timestep h. This modifies the policy gradient as follows:

\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[
\sum_{h=0}^{H-1} \nabla \log \pi_\theta (a_h | s_h) \left(
\left( \sum_{h' = h}^{H-1} r_{h'} \right)
- b_h(s_h)
\right)
\right].

For example, we might want b_h to estimate the average reward-to-go at a given timestep:

b_h^\theta = \E_{\tau \sim \rho_\theta} R_h(\tau).

This way, the random variable R_h(\tau) - b_h^\theta is centered around zero, making certain algorithms more stable.
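For a single sampled trajectory, the centered reward-to-go is easy to compute (a sketch; rewards and baseline are assumed arrays of per-timestep values):

import jax.numpy as jnp

def centered_rewards_to_go(rewards: jnp.ndarray, baseline: jnp.ndarray) -> jnp.ndarray:
    """Compute R_h(τ) − b_h, where R_h(τ) = Σ_{h' ≥ h} r_{h'}."""
    rtg = jnp.cumsum(rewards[::-1])[::-1]  # suffix sums of the rewards
    return rtg - baseline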

As a better baseline, we could instead choose the value function. Note that the random variable Q^\pi_h(s, a) - V^\pi_h(s), where the randomness is taken over the actions, is also centered around zero. (Recall V^\pi_h(s) = \E_{a \sim \pi} Q^\pi_h(s, a).) We call this difference the advantage function A^{\pi}_h(s, a) := Q^\pi_h(s, a) - V^\pi_h(s). This measures how much better this action does than the average for that policy. (Note that for an optimal policy π⋆, the advantage of a given state-action pair is always zero or negative.)

We can now express the policy gradient as follows. Note that the advantage function effectively replaces the Q-function from (6.22):

\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[
\sum_{h=0}^{H-1} \nabla \log \pi_\theta(a_h | s_h) A^{\pi_\theta}_h (s_h, a_h)
\right].

Note that to avoid correlations between the gradient estimator and the value estimator (i.e. the baseline), we must estimate them with independently sampled trajectories.

6.6 Comparing policy gradient algorithms to policy iteration

What advantages does the policy gradient algorithm have over Section 1.5.3.2?

To analyze the difference between them, we’ll make use of the performance difference lemma (PDL), which provides an expression for comparing the difference between two value functions.

Theorem 6.1 (Performance difference lemma) For any policies π and π̃ and any initial state s,

V_0^{\tilde \pi}(s) - V_0^{\pi}(s) = \E_{\tau \sim \rho_{\tilde \pi, s}} \left[ \sum_{h=0}^{H-1} A_h^{\pi}(s_h, a_h) \right].

(Note that the “inner” expectation from expanding the advantage function has the same distribution as the outer one, so omitting it here is valid.)

The PDL gives insight into why fitted approaches such as PI don’t work as well in the “full” RL setting. To see why, let’s consider a single iteration of policy iteration, where policy π gets updated to π̃. We’ll assume these policies are deterministic. Suppose the new policy π̃ chooses some action with a negative advantage with respect to π. That is, when acting according to π, taking the action from π̃ would perform worse than expected. Define \Delta_\infty to be the most negative advantage, that is, \Delta_\infty = \min_{s \in \mathcal{S}} A^{\pi}_h(s, \tilde \pi(s)). Plugging this into Theorem 6.1 gives

\begin{aligned}
V_0^{\tilde \pi}(s) - V_0^{\pi}(s) &= \E_{\tau \sim \rho_{\tilde \pi, s}} \left[ \sum_{h=0}^{H-1} A_h^{\pi}(s_h, a_h) \right] \\
&\ge H \Delta_\infty \\
V_0^{\tilde \pi}(s) &\ge V_0^{\pi}(s) - H|\Delta_\infty|.
\end{aligned}

That is, for some state s, the lower bound on the performance of π̃ is lower than the performance of π. This doesn’t state that π̃ will necessarily perform worse than π; it only says that this is possible. If such worst-case states do exist, though, a single policy iteration update could cause a large drop in performance. Policy gradient methods, in contrast, only adjust the parameters a small amount at each step. Then, by adjusting the parameters only a small distance, the new policy will also have a similar trajectory distribution. But this is not very rigorous, and in practice the parameter-to-distribution mapping may not be so smooth. Can we constrain the distance between the resulting distributions more explicitly?

This brings us to the next three methods:

  • trust region policy optimization (TRPO), which explicitly constrains the difference between the distributions before and after each step;
  • the natural policy gradient (NPG), a first-order approximation of TRPO;
  • proximal policy optimization (PPO), a “soft relaxation” of TRPO.

6.7 Trust region policy optimization

We saw above that policy gradient methods are effective because they implicitly constrain how much the policy changes at each iteration. Can we design an algorithm that explicitly constrains the “step size”? That is, we want to improve the policy as much as possible, measured in terms of the r.h.s. of Theorem 6.1, while ensuring that its trajectory distribution does not change too much:

\begin{aligned}
\theta^{k+1} &\gets \arg\max_{\theta^{\text{opt}}} \E_{s_0, \dots, s_{H-1} \sim \pi^{k}} \left[ \sum_{h=0}^{H-1} \E_{a_h \sim \pi^{\theta^\text{opt}}(s_h)} A^{\pi^{k}}(s_h, a_h) \right] \\
& \text{where } \text{distance}(\rho_{\theta^{\text{opt}}}, \rho_{\theta^k}) < \delta
\end{aligned}

Note that we have made a small change to the r.h.s. expression: we use the states sampled from the old policy, and only use the actions from the new policy. It would be computationally infeasible to sample entire trajectories from \pi_\theta as we are optimizing over θ. On the other hand, if \pi_\theta returns a vector representing a probability distribution over actions, then evaluating the expected advantage with respect to this distribution only requires taking a dot product. This approximation also matches the r.h.s. of the PDL to first order in θ. (We will elaborate more on this later.)

How do we describe the distance between \rho_{\theta^{\text{opt}}} and \rho_{\theta^k}? We’ll use the Kullback–Leibler divergence (KLD):
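For two distributions p and q over the same space, it is defined as

\mathrm{KL}\left(p \parallel q\right) = \E_{x \sim p} \left[ \log \frac{p(x)}{q(x)} \right],

which is nonnegative and equals zero exactly when p = q.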

\ No newline at end of file diff --git a/pg.json b/pg.json index b32cc14..7f97b99 100644 --- a/pg.json +++ b/pg.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"4e3cf3b85afff638f23199a2495738ab6517dc3e476ebe23d4147abbdf58b4e7","slug":"pg","location":"/pg.md","dependencies":[],"frontmatter":{"title":"6 Policy Optimization","numbering":{"all":{"enabled":true},"enumerator":{"template":"6.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.png","thumbnailOptimized":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.webp","exports":[{"format":"md","filename":"pg.md","url":"/build/pg-2a9dbd794279ec500807a9217877d9ae.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"i7hRHLAqO5"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"6.1","key":"Jdy3c6287u"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"The core task of RL is finding the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"aI17qaKscA"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"vWweXdeERp"}],"key":"TPsu2CEeGe"},{"type":"text","value":" in a given environment.\nThis is essentially an ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"XOCMVU9Iou"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"optimization problem:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"di1zuz8NUU"}],"key":"VaDQKiFZzG"},{"type":"text","value":"\nout of some space of policies,\nwe want to find the one that achieves the maximum total reward (in expectation).","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"eE4vBUNe24"}],"key":"fLNfFeWp3U"},{"type":"paragraph","position":{"start":{"line":25,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"It’s typically intractable to compute the optimal policy exactly.\nInstead, ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"RiiHWm9UAm"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"policy optimization 
algorithms","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"FeQ7oscGzn"}],"key":"G5O9Q0Rju3"},{"type":"text","value":" start from some randomly initialized policy,\nand then ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"agwnqpOnle"},{"type":"emphasis","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"improve","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"XWNPeUhmgE"}],"key":"ousn9OrUbF"},{"type":"text","value":" it step by step.\nWe’ve already seen some examples of these,\nnamely ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"bct73ecpoC"},{"type":"crossReference","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"Section ","key":"PjvFlQ62qG"},{"type":"text","value":"1.5.3.2","key":"VIip0unISg"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"rxU4x1FAMq"},{"type":"text","value":" for finite MDPs and ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"dBOCVMkXqK"},{"type":"crossReference","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"Section ","key":"aG0imJMI4g"},{"type":"text","value":"2.6.4","key":"QQO9ombVY8"}],"identifier":"iterative_lqr","label":"iterative_lqr","kind":"heading","template":"Section %s","enumerator":"2.6.4","resolved":true,"html_id":"iterative-lqr","remote":true,"url":"/control","dataUrl":"/control.json","key":"KxdAhaThnQ"},{"type":"text","value":" in continuous control.\nIn particular, we often use policies that can be described by some finite set of ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"FKOIZzGiUb"},{"type":"emphasis","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"parameters.","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"MrjVVgf5J3"}],"key":"P9cfybVJDd"},{"type":"text","value":"\nFor such parameterized policies,\nwe can approximate the ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"bwvhYbJ4LQ"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"policy gradient:","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"fXQlqYcDxi"}],"key":"FKi9DNPMpK"},{"type":"text","value":"\nthe gradient of the expected total reward with respect to the parameters.\nThis tells us the direction the parameters should be updated to achieve a higher total reward (in expectation).\nPolicy gradient methods are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models,\nmany of which use policies parameterized as deep neural networks.","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"A9Qh7Dhvzr"}],"key":"ooUvEnJUVh"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":38,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":38,"column":1},"end":{"line":39,"column":1}},"children":[{"type":"text","value":"We begin the chapter with a short review 
of gradient ascent,\na general ","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"SlW3b0NC6h"},{"type":"strong","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"text","value":"optimization method.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"NwCuRkfQQ7"}],"key":"TMY5iG25Hj"}],"key":"vSMoEgpfzV"},{"type":"listItem","spread":true,"position":{"start":{"line":40,"column":1},"end":{"line":41,"column":1}},"children":[{"type":"text","value":"We’ll then see how to estimate the ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"AhxRBFdXum"},{"type":"strong","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"policy gradient,","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"PztClrXmAZ"}],"key":"PeJ3hQ8LI0"},{"type":"text","value":"\nenabling us to apply (stochastic) gradient ascent in the RL setting.","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"qruWwoL5EB"}],"key":"drU1RRJk7e"},{"type":"listItem","spread":true,"position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"Then we’ll explore some ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"m2ebq9Qldc"},{"type":"emphasis","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"proximal optimization","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"ltWybWH6e0"}],"key":"nckWkERfHf"},{"type":"text","value":" techniques that ensure the steps taken are “not too large”.\nThis is helpful to stabilize training and widely used in practice.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"BOCqViHSh5"}],"key":"vVTZ3yKCKd"}],"key":"yEJiThGlNv"}],"key":"a0wlVEyR1M"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from utils import plt, Array, Callable, jax, jnp","key":"RsuYaKf2Md"},{"type":"output","id":"OZ6dVUBTB8FwMZL6FLhur","data":[],"key":"A3G64TJFWp"}],"data":{},"key":"hKpvewEIlg"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":49,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"Gradient Ascent","position":{"start":{"line":49,"column":1},"end":{"line":49,"column":1}},"key":"lNh6sSXO6t"}],"identifier":"gradient-ascent","label":"Gradient Ascent","html_id":"gradient-ascent","implicit":true,"enumerator":"6.2","key":"Zm2RUE3Csf"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"strong","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"Gradient ascent","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"Mj1Oxly8ql"}],"key":"DWocJPClPS"},{"type":"text","value":" is a general optimization algorithm for any differentiable function.\nA suitable analogy for this algorithm is hiking up a mountain,\nwhere you keep taking steps in the steepest direction upwards.\nHere, your vertical position 
","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"sRU1uCxqrB"},{"type":"inlineMath","value":"y","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"html":"yyy","key":"xqM2xRLAqD"},{"type":"text","value":" is the function being optimized,\nand your horizontal position ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"r1rxqD7OnE"},{"type":"inlineMath","value":"(x, z)","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"html":"(x,z)(x, z)(x,z)","key":"hQY3qtkkTH"},{"type":"text","value":" is the input to the function.\nThe ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"Ld2a5P4cmQ"},{"type":"emphasis","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"slope","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"N6REF8CYl2"}],"key":"gqFYziMvqh"},{"type":"text","value":" of the mountain at your current position is given by the ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"sWoPdCKtqA"},{"type":"emphasis","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"gradient","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"nQi5fonyBb"}],"key":"IjmHz4E8Gm"},{"type":"text","value":",\nwritten ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"ak4pijQ5q5"},{"type":"inlineMath","value":"\\nabla y(x, z) \\in \\mathbb{R}^2","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"html":"y(x,z)R2\\nabla y(x, z) \\in \\mathbb{R}^2y(x,z)R2","key":"AvQpRTIypG"},{"type":"text","value":".","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"Ib94hG092L"}],"key":"gOBMvGt0PK"}],"key":"pTvKsmw5po"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def f(x, y):\n \"\"\"Himmelblau's function\"\"\"\n return (x**2 + y - 11)**2 + (x + y**2 - 7)**2\n\n# Create a grid of points\nx = jnp.linspace(-5, 5, 400)\ny = jnp.linspace(-5, 5, 400)\nX, Y = jnp.meshgrid(x, y)\nZ = f(X, Y)\n\n# Create the plot\nfig, ax = plt.subplots(figsize=(6, 6))\n\n# Plot the function using imshow\nimg = ax.imshow(Z, extent=[-5, 5, -5, 5], origin='lower')\n\n# Add color bar\nfig.colorbar(img, ax=ax)\n\n# Gradient computation using JAX\ntx, ty = 1.0, 1.0\ngx, gy = jax.grad(f, argnums=(0, 1))(tx, ty)\n\n# Scatter point\nax.scatter(tx, ty, color='red', s=100)\n\n# Add arrow representing the gradient\nax.arrow(tx, ty, gx * 0.01, gy * 0.01, head_width=0.3, head_length=0.3, fc='blue', ec='blue')\n\n# Add plot title\nax.set_title(\"Himmelblau's Function\")\n\nplt.show()","key":"rdTn5cfcTJ"},{"type":"output","id":"gjM3gHhmiNODtO2TpAPUz","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"b8e65b5253271f49ddf227a711c3aa2c","path":"/build/b8e65b5253271f49ddf227a711c3aa2c.png"}}}],"key":"R63Vi0CkNP"}],"data":{},"key":"fs8n3bUw7e"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":95,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"text","value":"For differentiable functions, this can be thought of as the vector of partial derivatives,","position":{"start":{"line":95,"column":1},"end":{"line":95,"column":1}},"key":"XVKjjKJWYh"}],"key":"iIuF1xm8No"},{"type":"math","value":"\\nabla y(x, z) = \\begin{pmatrix}\n\\frac{\\partial y}{\\partial x} \\\\\n\\frac{\\partial y}{\\partial z}\n\\end{pmatrix}.","position":{"start":{"line":97,"column":1},"end":{"line":102,"column":1}},"html":"y(x,z)=(yxyz).\\nabla y(x, z) = \\begin{pmatrix}\n\\frac{\\partial y}{\\partial x} \\\\\n\\frac{\\partial y}{\\partial z}\n\\end{pmatrix}.y(x,z)=(xyzy).","enumerator":"6.1","key":"LJnZ7bMfnI"},{"type":"paragraph","position":{"start":{"line":104,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"To calculate the ","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"YNm85J9oiX"},{"type":"emphasis","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"slope","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"tANWCyYCwC"}],"key":"YD0BWRfbQS"},{"type":"text","value":" (aka “directional derivative”) of the mountain in a given direction ","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"G20K2OgOp7"},{"type":"inlineMath","value":"(\\Delta x, \\Delta z)","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"html":"(Δx,Δz)(\\Delta x, \\Delta z)(Δx,Δz)","key":"gE7l0i8PQM"},{"type":"text","value":",\nyou take the dot product of the difference vector with the gradient.\nThis means that the direction with the highest slope is exactly the gradient itself,\nso we can describe the gradient ascent algorithm as follows:","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"saOdKAw30v"}],"key":"x6ER6BMvsa"},{"type":"proof","kind":"definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Gradient ascent","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"LkVLxxQc9V"}],"key":"WXVl3GEdkH"},{"type":"math","value":"\\begin{pmatrix}\nx^{k+1} \\\\ z^{k+1}\n\\end{pmatrix}\n= \n\\begin{pmatrix}\nx^{k} \\\\ z^{k}\n\\end{pmatrix}\n+\n\\eta \\nabla y(x^{k}, z^{k})","position":{"start":{"line":110,"column":1},"end":{"line":120,"column":1}},"html":"(xk+1zk+1)=(xkzk)+ηy(xk,zk)\\begin{pmatrix}\nx^{k+1} \\\\ z^{k+1}\n\\end{pmatrix}\n= \n\\begin{pmatrix}\nx^{k} \\\\ z^{k}\n\\end{pmatrix}\n+\n\\eta \\nabla y(x^{k}, z^{k})(xk+1zk+1)=(xkzk)+ηy(xk,zk)","enumerator":"6.2","key":"u07ksBYD3p"}],"enumerator":"6.1","key":"UejPQIQY0V"},{"type":"paragraph","position":{"start":{"line":123,"column":1},"end":{"line":124,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"Pl8p8K9qTM"},{"type":"inlineMath","value":"k","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"kkk","key":"A9dyOJ8XBC"},{"type":"text","value":" denotes the iteration of the algorithm and 
","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"eDiVrNQ1w0"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"η>0\\eta > 0η>0","key":"oRmFRXz3qc"},{"type":"text","value":" is a “step size” hyperparameter that controls the size of the steps we take.\n(Note that we could also vary the step size across iterations, that is, ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"q9gnZlRVMA"},{"type":"inlineMath","value":"\\eta^0, \\dots, \\eta^K","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"η0,,ηK\\eta^0, \\dots, \\eta^Kη0,,ηK","key":"xzrh3wgxAH"},{"type":"text","value":".)","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"lkorJSsxXz"}],"key":"KlAVNIzsLJ"},{"type":"paragraph","position":{"start":{"line":126,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"text","value":"The case of a two-dimensional input is easy to visualize.\nBut this idea can be straightforwardly extended to higher-dimensional inputs.","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"key":"XN2vHn0QSt"}],"key":"Spbkcu7WvJ"},{"type":"paragraph","position":{"start":{"line":129,"column":1},"end":{"line":130,"column":1}},"children":[{"type":"text","value":"From now on, we’ll use ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"ttU4y6QCsO"},{"type":"inlineMath","value":"J","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"html":"JJJ","key":"IOGZD8tF4e"},{"type":"text","value":" to denote the function we’re trying to maximize,\nand ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"ZY6NLVczGa"},{"type":"text","value":"θ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"rbrdGuURjL"},{"type":"text","value":" to denote the parameters being optimized over. 
(In the above example, ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"yMMpwOH0KP"},{"type":"inlineMath","value":"\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\top","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"html":"θ=(xz)\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\topθ=(xz)","key":"yGi9853fxb"},{"type":"text","value":").","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"jKlFcimiHV"}],"key":"jjj3P7foi8"},{"type":"paragraph","position":{"start":{"line":132,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"text","value":"Notice that our parameters will stop changing once ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"JObM8YDmp7"},{"type":"inlineMath","value":"\\nabla J(\\theta) = 0.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"J(θ)=0.\\nabla J(\\theta) = 0.J(θ)=0.","key":"O2r7qCpbgi"},{"type":"text","value":"\nOnce we reach this ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"RosB7cuFWI"},{"type":"strong","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"stationary point,","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"UgdKRJls9T"}],"key":"PO7K9up5wd"},{"type":"text","value":" our current parameters are ‘locally optimal’ in some sense;\nit’s impossible to increase the function by moving in any direction.\nIf ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"YprDLettKz"},{"type":"inlineMath","value":"J","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"JJJ","key":"nbBcUVSHor"},{"type":"text","value":" is ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"Bv4OewHxzl"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"convex","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"ooIoY8vP5w"}],"key":"ZT4SopYzhT"},{"type":"text","value":", then the only point where this happens is at the ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"eJNbKBrErR"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"global optimum.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"ZFZOewVLeL"}],"key":"EQ9aydyt9j"},{"type":"text","value":"\nOtherwise, if ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"eQD31e5yxR"},{"type":"inlineMath","value":"J","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"JJJ","key":"GWmsieYp0J"},{"type":"text","value":" is nonconvex, the best we can hope for is a ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"kPYr2jlNW2"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"local 
optimum.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"b3VJz0eG2O"}],"key":"djuPxPBy6O"}],"key":"aaMbldVrmF"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"JrNiZ5m92x"}],"key":"ZDNvEEClCJ"},{"type":"paragraph","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"children":[{"type":"text","value":"How does a computer compute the gradient of a function?","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"key":"dcLAmfUe0X"}],"key":"mCUpk74cCb"},{"type":"paragraph","position":{"start":{"line":141,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"One way is ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"e41FtnLlZX"},{"type":"emphasis","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"symbolic differentiation,","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"KndrTL8myD"}],"key":"Ommqh69ccw"},{"type":"text","value":"\nwhich is similar to the way you might compute it by hand:\nthe computer applies a list of rules to transform the ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"uUIkykSoRZ"},{"type":"emphasis","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"symbols","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"d4SwZTPmXW"}],"key":"pH7x5JeCoH"},{"type":"text","value":" involved.\nPython’s ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"QzJ6rGazg0"},{"type":"inlineCode","value":"sympy","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"aZftUhfmrv"},{"type":"text","value":" package supports symbolic differentiation.\nHowever, functions implemented in code may not always have a straightforward symbolic representation.","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"yWRz4df0px"}],"key":"bcbO3d0IqT"},{"type":"paragraph","position":{"start":{"line":147,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"Another way is ","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"xe9NfCiD1s"},{"type":"emphasis","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"numerical differentiation,","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"mEMKcjwc7T"}],"key":"C7WZhfDDSt"},{"type":"text","value":"\nwhich is based on the limit definition of a (directional) derivative:","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"gNyBdefRSj"}],"key":"gq3iXV72Eb"},{"type":"math","value":"\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - J(\\boldsymbol{x})}{\\varepsilon}","position":{"start":{"line":150,"column":1},"end":{"line":153,"column":1}},"html":"uJ(x)=limε0J(x+εu)J(x)ε\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - 
Then, we can substitute a small value of $\varepsilon$ on the r.h.s. to approximate the directional derivative.
How small, though? If we need an accurate estimate,
we may need such a small value of $\varepsilon$ that typical computers will run into rounding errors.
Also, to compute the full gradient,
we would need to compute the r.h.s. once for each input dimension.
This is an issue if computing $J$ is expensive.

**Automatic differentiation** achieves the best of both worlds.
Like symbolic differentiation,
we manually implement the derivative rules for a few basic operations.
However, instead of executing these on the *symbols*,
we execute them on the *values* when the function gets called,
like in numerical differentiation.
This allows us to differentiate through programming constructs such as branches or loops,
and doesn't involve any arbitrarily small values.
:::
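As a quick illustration of the tradeoff, here is a sketch comparing a finite-difference estimate against JAX's automatic differentiation; the function `J` is an arbitrary example chosen for this sketch:

```python
import jax
import jax.numpy as jnp


def J(x):
    # Any differentiable function implemented in code.
    return jnp.sin(x[0]) * jnp.exp(x[1])


x = jnp.array([1.0, 0.5])

# Numerical differentiation: approximate the directional derivative along u.
# Choosing ε trades off truncation error against floating-point rounding error.
ε = 1e-4
u = jnp.array([1.0, 0.0])
numerical = (J(x + ε * u) - J(x)) / ε

# Automatic differentiation: exact up to floating point,
# and one call returns the entire gradient at once.
automatic = jax.grad(J)(x)

print(numerical, automatic[0])  # the two values agree closely
```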
gradient ascent","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"xheTokJhHv"}],"identifier":"stochastic-gradient-ascent","label":"Stochastic gradient ascent","html_id":"stochastic-gradient-ascent","implicit":true,"enumerator":"6.2.1","key":"ivUWpRqVWI"},{"type":"paragraph","position":{"start":{"line":176,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"text","value":"In real applications,\ncomputing the gradient of the target function is not so simple.\nAs an example from supervised learning, ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"m3OjtnDJhc"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"mGaNEBXoqj"},{"type":"text","value":" might be the sum of squared prediction errors across an entire training dataset.\nHowever, if our dataset is very large, it might not fit into our computer’s memory!\nIn these cases, we often compute some ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"JBHZh3jSog"},{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"estimate","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"OjYOLF0HUx"}],"key":"JkU8gVR3dn"},{"type":"text","value":" of the gradient at each step, ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"tRA6r588Qj"},{"type":"inlineMath","value":"\\tilde \\nabla J(\\theta)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"html":"~J(θ)\\tilde \\nabla J(\\theta)~J(θ)","key":"xys6c6g18X"},{"type":"text","value":", and walk in that direction instead.\nThis is called ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"px1acgwQEd"},{"type":"strong","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"jezzF6Bqs5"}],"key":"TsqTR0E5d4"},{"type":"text","value":" gradient ascent.\nIn the SL example above, we might randomly choose a ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"asuxtF72ya"},{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"minibatch","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"pTWfb25Dmo"}],"key":"SmGPphIzQw"},{"type":"text","value":" of samples and use them to estimate the true prediction error. 
```{code-cell} python
from collections.abc import Callable

from jax import Array


def sgd(
    θ_init: Array,
    estimate_gradient: Callable[[Array], Array],
    η: float,
    n_steps: int,
):
    """Perform `n_steps` steps of SGD.

    `estimate_gradient` eats the current parameters and returns an estimate of the objective function's gradient at those parameters.
    """
    θ = θ_init
    for step in range(n_steps):
        θ += η * estimate_gradient(θ)
    return θ
```

What makes one gradient estimator better than another?
Ideally, we want this estimator to be **unbiased;** that is, on average, it matches a single true gradient step:

$$
\E [\tilde \nabla J(\theta)] = \nabla J(\theta).
$$

We also want the *variance* of the estimator to be low so that its performance doesn't change drastically at each step.
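For concreteness, here is one hypothetical unbiased estimator, plugged into the `sgd` function above: the minibatch gradient of a least-squares objective $J(\theta) = -\frac{1}{2n} \|y - X\theta\|^2$. All names and dimensions here are made up for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 5))
true_θ = np.arange(1.0, 6.0)
y = X @ true_θ + 0.1 * rng.normal(size=10_000)


def estimate_gradient(θ, batch_size=32):
    # Sampling indices uniformly at random makes this estimator unbiased:
    # averaged over the minibatch choice, it equals the full-dataset gradient.
    idx = rng.integers(0, len(X), size=batch_size)
    Xb, yb = X[idx], y[idx]
    residual = yb - Xb @ θ
    return Xb.T @ residual / batch_size


θ = sgd(np.zeros(5), estimate_gradient, η=0.01, n_steps=5_000)
print(θ)  # ≈ true_θ
```

Larger batches don't change the estimator's mean, only its variance, which shrinks as the batch size grows.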
","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"TNFSs6g2e8"},{"type":"text","value":"θ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"DPgno0AzXX"},{"type":"text","value":" that is “close” to a stationary point.\nIn another perspective, for such functions, the local “landscape” of ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"jTjii8aPYi"},{"type":"inlineMath","value":"J","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"html":"JJJ","key":"XT9houtBC1"},{"type":"text","value":" around ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"Hon0NJkXQs"},{"type":"text","value":"θ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"MQxGSRgBtj"},{"type":"text","value":" becomes flatter and flatter the longer we run SGD.","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"wGtDy9CRPq"}],"key":"fuUnv5G4Oz"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"SGD convergence","position":{"start":{"line":213,"column":1},"end":{"line":213,"column":1}},"key":"Zz7ewR8YRB"}],"key":"PzLPZoINzZ"},{"type":"paragraph","position":{"start":{"line":214,"column":1},"end":{"line":217,"column":1}},"children":[{"type":"text","value":"More formally, suppose we run SGD for ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"tDFACgXQDN"},{"type":"inlineMath","value":"K","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"KKK","key":"eo1akjLJg1"},{"type":"text","value":" steps, using an unbiased gradient estimator.\nLet the step size ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"aif19MpNBn"},{"type":"inlineMath","value":"\\eta^k","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"ηk\\eta^kηk","key":"bLPJVLvpIn"},{"type":"text","value":" scale as ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"gQosT0CyOW"},{"type":"inlineMath","value":"O(1/\\sqrt{k}).","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"O(1/k).O(1/\\sqrt{k}).O(1/k).","key":"cwXLADRqvE"},{"type":"text","value":"\nThen if ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"QDL8s4GxiV"},{"type":"inlineMath","value":"J","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"JJJ","key":"DJTAs5dI9x"},{"type":"text","value":" is bounded and ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"vtOzCg3Q7Z"},{"type":"text","value":"β","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"M6R2oam2KH"},{"type":"text","value":"-smooth (see below),\nand the ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"GzX2FLoWM8"},{"type":"emphasis","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"text","value":"norm","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"z4ir2KDMMp"}],"key":"hL9NwN7jx4"},{"type":"text","value":" of the gradient estimator has a bounded second moment 
","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"sR3omwZgpH"},{"type":"inlineMath","value":"\\sigma^2,","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"σ2,\\sigma^2,σ2,","key":"XFJjp2eTY0"}],"key":"sJI4U8JcIc"},{"type":"math","value":"\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"html":"J(θK)2O(Mβσ2/K).\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).∥∇J(θK)2O(σ2/K).","enumerator":"6.5","key":"WROSZLaIvv"},{"type":"paragraph","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"children":[{"type":"text","value":"We call a function ","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"LSIwEb4j1n"},{"type":"text","value":"β","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"DlHlbsopcQ"},{"type":"text","value":"-smooth if its gradient is Lipschitz continuous with constant ","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"dRZJw6HiWi"},{"type":"text","value":"β","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"RYBEHwl1SR"},{"type":"text","value":":","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"BdVRio1J0B"}],"key":"SEz8o0t9CO"},{"type":"math","value":"\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"J(θ)J(θ)βθθ.\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.∥∇J(θ)J(θ)βθθ∥.","enumerator":"6.6","key":"y1knzBvg52"}],"key":"OlOmGwhPEz"},{"type":"paragraph","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"children":[{"type":"text","value":"We’ll now see a concrete application of gradient ascent in the context of policy optimization.","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"key":"u1A0TbG3rv"}],"key":"tX5gpzz6Pf"}],"key":"LYi7X9FGSr"},{"type":"block","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"Policy (stochastic) gradient ascent","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"dkbvhS4xwD"}],"identifier":"policy-stochastic-gradient-ascent","label":"Policy (stochastic) gradient ascent","html_id":"policy-stochastic-gradient-ascent","implicit":true,"enumerator":"6.3","key":"wdjRY3xZLQ"},{"type":"paragraph","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"Remember that in RL, the primary goal is to find the ","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"x4ltl8PcVn"},{"type":"emphasis","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"a38zotgJ5L"}],"key":"aoZVX7n5s0"},{"type":"text","value":" that achieves the maximimum total reward, which we can express using the value function we defined in 
","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"X1Y7lXODD0"},{"type":"crossReference","kind":"proof:definition","identifier":"value","label":"value","children":[{"type":"text","value":"Definition ","key":"jcKRxO1J8m"},{"type":"text","value":"1.6","key":"hNI0ZbDtyk"}],"template":"Definition %s","enumerator":"1.6","resolved":true,"html_id":"value","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"AeYjAotbRG"},{"type":"text","value":":","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"ofGl1Uj1dP"}],"key":"Hw18qdYmK9"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E \\sum_{\\hi=0}^{\\hor-1} r_\\hi \\\\\n \\text{where} \\quad & s_0 \\sim \\mu_0 \\\\\n & s_{t+1} \\sim P(s_\\hi, a_\\hi), \\\\\n & a_\\hi = \\pi(s_\\hi) \\\\\n & r_\\hi = r(s_\\hi, a_\\hi).\n\\end{aligned}","label":"objective_fn","identifier":"objective_fn","html":"J(π):=Es0μ0Vπ(s0)=Eh=0H1rhwheres0μ0st+1P(sh,ah),ah=π(sh)rh=r(sh,ah).\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E \\sum_{\\hi=0}^{\\hor-1} r_\\hi \\\\\n \\text{where} \\quad & s_0 \\sim \\mu_0 \\\\\n & s_{t+1} \\sim P(s_\\hi, a_\\hi), \\\\\n & a_\\hi = \\pi(s_\\hi) \\\\\n & r_\\hi = r(s_\\hi, a_\\hi).\n\\end{aligned}J(π):=Es0μ0Vπ(s0)=whereEh=0H1rhs0μ0st+1P(sh,ah),ah=π(sh)rh=r(sh,ah).","enumerator":"6.7","html_id":"objective-fn","key":"bLbyXZoXER"},{"type":"paragraph","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"(Note that we’ll continue to work in the ","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"sCdlid74HQ"},{"type":"emphasis","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"undiscounted, finite-horizon case.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"SbnCTeKZpb"}],"key":"dKXBcinSsH"},{"type":"text","value":" Analogous results hold for the ","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"hg9ZKzCCac"},{"type":"emphasis","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"discounted, infinite-horizon case.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"iwxCrDj7uU"}],"key":"cWxo44vgYZ"},{"type":"text","value":")","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"qq9srCY6gl"}],"key":"F3X7jGVoc3"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":251,"column":1}},"children":[{"type":"text","value":"As shown by the notation, this is exactly the function ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"dwvDa2kbdm"},{"type":"inlineMath","value":"J","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"JJJ","key":"zEu6HnAKlM"},{"type":"text","value":" that we want to maximize using gradient ascent.\nWhat does ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"uGciK632ev"},{"type":"text","value":"θ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"n1XDjNl3EZ"},{"type":"text","value":" correspond to, though?\nIn general, 
","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"aNwEfiQNn0"},{"type":"text","value":"π","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"IkEeZhoxMI"},{"type":"text","value":" is a function, and optimizing over the space of arbitrary input-output mappings would be intractable.\nInstead, we need to describe ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"OQIrsJZwe6"},{"type":"text","value":"π","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"kMacjJ2kFP"},{"type":"text","value":" in terms of some finite set of ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"cqbIihGpFw"},{"type":"emphasis","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"parameters","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"qwg05VSxZA"}],"key":"Nik6kwHyPf"},{"type":"text","value":" ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"kj8SzvB7Di"},{"type":"text","value":"θ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"Ukv3dqNiGl"},{"type":"text","value":".","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"C9MEfGNUNp"}],"key":"d99wvnK5b8"}],"key":"W3p7MOM82j"},{"type":"block","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Example policy parameterizations","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"s8b0nIQSUt"}],"label":"parameterizations","identifier":"parameterizations","html_id":"parameterizations","enumerator":"6.3.1","key":"rIURjfPNEs"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"What are some ways we could parameterize our policy?","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"XsQJcywGSC"}],"key":"gWjt3DPjmV"}],"key":"jHP1mfRcTO"},{"type":"block","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"heading","depth":4,"position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"children":[{"type":"text","value":"Tabular representation","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"boES1VB43t"}],"identifier":"tabular-representation","label":"Tabular representation","html_id":"tabular-representation","implicit":true,"enumerator":"6.3.1.1","key":"kOVXuzgzY1"},{"type":"paragraph","position":{"start":{"line":264,"column":1},"end":{"line":267,"column":1}},"children":[{"type":"text","value":"If both the state and action spaces are finite, perhaps we could simply learn a preference value ","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"R2lykxJtrF"},{"type":"inlineMath","value":"\\theta_{s,a}","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"html":"θs,a\\theta_{s,a}θs,a","key":"SPIiRKxZUg"},{"type":"text","value":" for each state-action pair.\nThen to turn this into a valid distribution, we perform a 
","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"OoHHl2VVch"},{"type":"strong","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"children":[{"type":"text","value":"softmax","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"kgnvoT8qNx"}],"key":"F4jzL4H9hC"},{"type":"text","value":" operation:\nwe exponentiate each of them,\nand then normalize to form a valid distribution:","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"GM6SCrUp9V"}],"key":"H0K8gmnf51"},{"type":"math","value":"\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"html":"πθsoftmax(as)=exp(θs,a)s,aexp(θs,a).\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.πθsoftmax(as)=s,aexp(θs,a)exp(θs,a).","enumerator":"6.8","key":"EdQAcWsDep"},{"type":"paragraph","position":{"start":{"line":271,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"However, this doesn’t make use of any structure in the states or actions,\nso while this is flexible, it is also prone to overfitting.","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"g7tKVKzvYn"}],"key":"W5Icc0XLLX"},{"type":"heading","depth":4,"position":{"start":{"line":274,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Linear in features","position":{"start":{"line":274,"column":1},"end":{"line":274,"column":1}},"key":"pM6NRAu0RQ"}],"identifier":"linear-in-features","label":"Linear in features","html_id":"linear-in-features","implicit":true,"enumerator":"6.3.1.2","key":"eWwMNPh79E"},{"type":"paragraph","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"Another approach is to map each state-action pair into some ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"qQkta9eJK9"},{"type":"strong","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"feature space","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"RJ8byYgzMi"}],"key":"KdH9SGrF0v"},{"type":"text","value":" ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"aMwoi3A9OZ"},{"type":"inlineMath","value":"\\phi(s, a) \\in \\mathbb{R}^p","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"ϕ(s,a)Rp\\phi(s, a) \\in \\mathbb{R}^pϕ(s,a)Rp","key":"dbe8KuJY7r"},{"type":"text","value":". 
$$
\pi^\text{linear in features}_{\theta}(a|s) = \frac{\exp(\theta^\top \phi(s, a))}{\sum_{a'} \exp(\theta^\top \phi(s, a'))}.
$$

Another interpretation is that θ represents the feature vector of the "desired" state-action pair, as state-action pairs whose features align closely with θ are given higher probability.

The score function for this parameterization is also quite elegant:

$$
\begin{aligned}
    \nabla \log \pi_\theta(a|s) &= \nabla \left( \theta^\top \phi(s, a) - \log \left( \sum_{a'} \exp(\theta^\top \phi(s, a')) \right) \right) \\
    &= \phi(s, a) - \E_{a' \sim \pi_\theta(s)} \phi(s, a')
\end{aligned}
$$

Plugging this into our policy gradient expression, we get

$$
\begin{aligned}
    \nabla J(\theta) & = \E_{\tau \sim \rho_\theta} \left[
        \sum_{\hi=0}^{\hor-1} \nabla \log \pi_\theta(a_\hi | s_\hi) A_\hi^{\pi_\theta}
    \right] \\
    & = \E_{\tau \sim \rho_\theta} \left[
        \sum_{\hi=0}^{\hor-1} \left( \phi(s_\hi, a_\hi) - \E_{a' \sim \pi(s_\hi)} \phi(s_\hi, a') \right) A_\hi^{\pi_\theta}(s_\hi, a_\hi)
    \right] \\
    & = \E_{\tau \sim \rho_\theta} \left[ \sum_{\hi=0}^{\hor-1} \phi(s_\hi, a_\hi) A_\hi^{\pi_\theta} (s_\hi, a_\hi) \right]
\end{aligned}
$$
Why can we drop the $\E \phi(s_\hi, a')$ term? By linearity of expectation, consider the dropped term at a single timestep: $\E_{\tau \sim \rho_\theta} \left[ \left( \E_{a' \sim \pi(s_\hi)} \phi(s_\hi, a') \right) A_\hi^{\pi_\theta}(s_\hi, a_\hi) \right].$ By Adam's Law, we can wrap the advantage term in a conditional expectation on the state $s_\hi.$ Then we already know that $\E_{a \sim \pi(s)} A_\hi^{\pi}(s, a) = 0,$ and so this entire term vanishes.
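To sanity-check this closed form, here is a small sketch for a single state with a handful of discrete actions, comparing the score computed by autodiff against $\phi(s, a) - \E_{a' \sim \pi_\theta(s)} \phi(s, a')$. The feature matrix `φ` and all dimensions are hypothetical:

```python
import jax
import jax.numpy as jnp

p, n_actions = 4, 3
φ = jax.random.normal(jax.random.PRNGKey(0), (n_actions, p))  # row a holds φ(s, a)
θ = jax.random.normal(jax.random.PRNGKey(1), (p,))


def log_π(θ, a):
    logits = φ @ θ
    return logits[a] - jax.nn.logsumexp(logits)  # log of the softmax policy


# Score via automatic differentiation...
score_autodiff = jax.grad(log_π)(θ, 1)

# ...matches the closed form φ(s, a) − E_{a'∼π_θ(s)} φ(s, a').
π_probs = jax.nn.softmax(φ @ θ)
score_closed_form = φ[1] - π_probs @ φ

print(jnp.allclose(score_autodiff, score_closed_form, atol=1e-5))  # True
```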
policies","html_id":"neural-policies","implicit":true,"enumerator":"6.3.1.3","key":"bDqG6fwQk4"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"More generally, we could map states and actions to unnormalized scores via some parameterized function ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"uMnEdmHNXj"},{"type":"inlineMath","value":"f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"fθ:S×AR,f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},fθ:S×AR,","key":"mX2yk2XV7U"},{"type":"text","value":" such as a neural network, and choose actions according to a softmax: ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"YjpOjdIGvB"}],"key":"Z1yt9NHJGA"},{"type":"math","value":"\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"tight":"before","html":"πθgeneral(as)=exp(fθ(s,a))aexp(fθ(s,a)).\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.πθgeneral(as)=aexp(fθ(s,a))exp(fθ(s,a)).","enumerator":"6.12","key":"AwUjRAKaOl"},{"type":"paragraph","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"children":[{"type":"text","value":"The score can then be written as ","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"key":"u2rumNoSX6"}],"key":"tWIGOXcv1U"},{"type":"math","value":"\\nabla \\log \\pi_\\theta(a|s) = \\nabla f_\\theta(s, a) - \\E_{a \\sim \\pi_\\theta(s)} \\nabla f_\\theta (s, a')","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"tight":"before","html":"logπθ(as)=fθ(s,a)Eaπθ(s)fθ(s,a)\\nabla \\log \\pi_\\theta(a|s) = \\nabla f_\\theta(s, a) - \\E_{a \\sim \\pi_\\theta(s)} \\nabla f_\\theta (s, a')logπθ(as)=fθ(s,a)Eaπθ(s)fθ(s,a)","enumerator":"6.13","key":"ySUoCwP7NR"}],"key":"zob6gqKIVf"},{"type":"block","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"children":[{"type":"text","value":"Continuous action spaces","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"DDjSMzfDsz"}],"identifier":"continuous-action-spaces","label":"Continuous action spaces","html_id":"continuous-action-spaces","implicit":true,"enumerator":"6.3.2","key":"WyFEvFJv6o"},{"type":"paragraph","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Consider a continuous ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"Xa15dFstPL"},{"type":"inlineMath","value":"n","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"nnn","key":"lOJom32FH7"},{"type":"text","value":"-dimensional action space ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"QykMjkLWzl"},{"type":"inlineMath","value":"\\mathcal{A} = \\mathbb{R}^n","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"A=Rn\\mathcal{A} = \\mathbb{R}^nA=Rn","key":"LsYEtkFI1e"},{"type":"text","value":". 
Then for a stochastic policy, we could use a function to predict the *mean* action and then add some random noise about it. For example, we could use a neural network to predict the mean action $\mu_\theta(s)$ and then add some noise $\epsilon \sim \mathcal{N}(0, \sigma^2 I)$ to it:

$$
\pi_\theta(a|s) = \mathcal{N}(\mu_\theta(s), \sigma^2 I).
$$

<!-- **Exercise:** Can you extend the "linear in features" policy to continuous action spaces in a similar way? -->
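A sketch of such a Gaussian policy is below; the one-layer mean network `μ` and all dimensions are hypothetical:

```python
import jax
import jax.numpy as jnp

n, state_dim, σ = 2, 3, 0.1
W = jax.random.normal(jax.random.PRNGKey(0), (n, state_dim))


def μ(s):
    return jnp.tanh(W @ s)  # predicted mean action


def sample_action(s, key):
    ε = σ * jax.random.normal(key, (n,))  # noise ~ N(0, σ²I)
    return μ(s) + ε


def log_π(s, a):
    # log N(a; μ(s), σ²I); its gradient in the parameters is the score.
    return (
        -0.5 * jnp.sum((a - μ(s)) ** 2) / σ**2
        - n * jnp.log(σ)
        - 0.5 * n * jnp.log(2 * jnp.pi)
    )
```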
","key":"ecEs6kxheS"}],"key":"ObOtFWgDJK"},{"type":"block","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"children":[{"type":"text","value":"Now that we have seen parameterized policies, we can now write the total reward in terms of the parameters:","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"M5B54VZOb6"}],"key":"HQxqGa2Wh7"},{"type":"math","value":"J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau).","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"html":"J(θ)=EτρθR(τ).J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau).J(θ)=EτρθR(τ).","enumerator":"6.15","key":"KF5W0Uk762"},{"type":"paragraph","position":{"start":{"line":328,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"text","value":"Now how do we maximize this function (the expected total reward) over the parameters?\nOne simple idea would be to directly apply gradient ascent:","position":{"start":{"line":328,"column":1},"end":{"line":328,"column":1}},"key":"tqXFEjFrXK"}],"key":"xi4rhdA9bQ"},{"type":"math","value":"\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).","position":{"start":{"line":331,"column":1},"end":{"line":333,"column":1}},"html":"θk+1=θk+ηJ(θk).\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).θk+1=θk+ηJ(θk).","enumerator":"6.16","key":"t7GG4cvcV4"},{"type":"paragraph","position":{"start":{"line":335,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"In order to apply this technique, we need to be able to evaluate the gradient ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"Piv8rI7qJK"},{"type":"inlineMath","value":"\\nabla J(\\theta).","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"J(θ).\\nabla J(\\theta).J(θ).","key":"JUuiQQGx5D"},{"type":"text","value":"\nBut ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"lhzSTCGGm4"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"Psmy4FCdBw"},{"type":"text","value":" is very difficult, or even intractable, to compute exactly, since it involves taking an expectation over all possible trajectories ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"dLzKYJgITG"},{"type":"inlineMath","value":"\\tau.","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"τ.\\tau.τ.","key":"wjQtznvflh"},{"type":"text","value":"\nCan we rewrite it in a form that’s more convenient to implement?","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"Rh5hAsBVwZ"}],"key":"wUP8SPaCph"}],"key":"Zrgyrr0jro"},{"type":"block","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":342,"column":1},"end":{"line":342,"column":1}},"children":[{"type":"text","value":"Importance Sampling","position":{"start":{"line":342,"column":1},"end":{"line":342,"column":1}},"key":"hradrCfEGt"}],"label":"importance_sampling","identifier":"importance_sampling","html_id":"importance-sampling","enumerator":"6.3.3","key":"otn2XS1Nqc"},{"type":"paragraph","position":{"start":{"line":344,"column":1},"end":{"line":352,"column":1}},"children":[{"type":"text","value":"There is a general trick called 
","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"iAaWNwMl7P"},{"type":"strong","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"importance sampling","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"y9cJw9mwNY"}],"key":"Hr5t2XByKV"},{"type":"text","value":" for evaluating such expectations.\nSuppose we want to estimate ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"TLa32vGXN4"},{"type":"inlineMath","value":"\\E_{x \\sim p}[f(x)]","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"Exp[f(x)]\\E_{x \\sim p}[f(x)]Exp[f(x)]","key":"fx0HveYxzS"},{"type":"text","value":" where ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"kbSUAATsUR"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"ss2VDtPOOo"},{"type":"text","value":" is hard or expensive to sample from. We can, however, evaluate the likelihood ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"PxomQwDPov"},{"type":"inlineMath","value":"p(x)","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"p(x)p(x)p(x)","key":"ku8Psu85ti"},{"type":"text","value":".\nSuppose that we ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"J0mLfaOV87"},{"type":"emphasis","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"can","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"nuSVJSUTWW"}],"key":"wxlBWGrnQK"},{"type":"text","value":" sample from a different distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"W352elm1HD"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"k56zHVDf6G"},{"type":"text","value":".\nSince an expectation is just a weighted average, we can sample ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"meuN694yKZ"},{"type":"inlineMath","value":"x","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"xxx","key":"sL1fvAleLI"},{"type":"text","value":" from ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"E3JAFuLNj6"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"DfN4mhjyUM"},{"type":"text","value":", compute ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"qG3AljeQZW"},{"type":"inlineMath","value":"f(x)","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"f(x)f(x)f(x)","key":"gAUr0K1igq"},{"type":"text","value":", and then reweight the results:\nif ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"v3a0SWdW6f"},{"type":"inlineMath","value":"x","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"xxx","key":"j83N0T6ASF"},{"type":"text","value":" is very likely under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"R402lIqflP"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"tQTr8gHF9q"},{"type":"text","value":" but 
unlikely under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"Zuh3lFpID7"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"oAcqPNrcIn"},{"type":"text","value":",\nwe should boost its weighting,\nand if it is common under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"mbLSYYa3JL"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"ew9fyu7qnn"},{"type":"text","value":" but uncommon under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"p4vhqc6zvf"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"hDRCt5eofr"},{"type":"text","value":",\nwe should lower its weighting.\nThe reweighting factor is exactly the ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"OJ05IOAgmn"},{"type":"strong","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"likelihood ratio","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"t6G0NWDIvR"}],"key":"iix0sCJ1hD"},{"type":"text","value":" between the target distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"U36ptZRQa9"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"TbdGRarwmf"},{"type":"text","value":" and the sampling distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"XrbPRf2y9b"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"AhqGK13By4"},{"type":"text","value":":","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"kcVC9VBQXc"}],"key":"K97VkG3ZXv"},{"type":"math","value":"\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].","position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"html":"Exp[f(x)]=xXf(x)p(x)=xXf(x)p(x)q(x)q(x)=Exq[p(x)q(x)f(x)].\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].Exp[f(x)]=xXf(x)p(x)=xXf(x)q(x)p(x)q(x)=Exq[q(x)p(x)f(x)].","enumerator":"6.17","key":"V9B3AzJBZ7"},{"type":"paragraph","position":{"start":{"line":358,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Doesn’t this seem too good to be true? If there were no drawbacks, we could use this to estimate ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"jzKkW9yGy9"},{"type":"emphasis","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"Jgw9Fd5zD4"}],"key":"tzqFQzTrPG"},{"type":"text","value":" expectation of any function on any arbitrary distribution! 
The drawback is that the variance may be very large due to the likelihood ratio term.
If there are values of $x$ that are very rare in the sampling distribution $q$,
but common under $p$,
then the likelihood ratio $p(x)/q(x)$ will cause the variance to blow up.

## The REINFORCE policy gradient

Returning to RL, suppose there is some trajectory distribution $\rho(\tau)$ that is **easy to sample from,** such as a database of existing trajectories.
We can then rewrite $\nabla J(\theta)$, a.k.a. the *policy gradient*, as follows.
All gradients are being taken with respect to θ.
the ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"bpjrNI4IrD"},{"type":"emphasis","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"policy gradient","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"reEskoECoH"}],"key":"uKHanDabmK"},{"type":"text","value":", as follows.\nAll gradients are being taken with respect to ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"UvzGzGzmGn"},{"type":"text","value":"θ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"lJ5aqYPsaB"},{"type":"text","value":".","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"RCwKWLTmB0"}],"key":"U5MddxxcUU"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}","position":{"start":{"line":369,"column":1},"end":{"line":375,"column":1}},"html":"J(θ)=Eτρθ[R(τ)]=Eτρ[ρθ(τ)ρ(τ)R(τ)]likelihood ratio trick=Eτρ[ρθ(τ)ρ(τ)R(τ)]switching gradient and expectation\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}J(θ)=Eτρθ[R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]likelihood ratio trickswitching gradient and expectation","enumerator":"6.18","key":"lyqrAeZqCL"},{"type":"paragraph","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"children":[{"type":"text","value":"Note that for ","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"RInR6Aq3rM"},{"type":"inlineMath","value":"\\rho = \\rho_\\theta","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"html":"ρ=ρθ\\rho = \\rho_\\thetaρ=ρθ","key":"u5A10jfxUm"},{"type":"text","value":", the inside term becomes","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"IebXv3VQjX"}],"key":"hK3AnnSKqF"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].","position":{"start":{"line":379,"column":1},"end":{"line":381,"column":1}},"html":"J(θ)=Eτρθ[logρθ(τ)R(τ)].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].J(θ)=Eτρθ[logρθ(τ)R(τ)].","enumerator":"6.19","key":"Mp7LXs97fP"},{"type":"paragraph","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"children":[{"type":"text","value":"(The order of operations is ","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"FenzfUdns0"},{"type":"inlineMath","value":"\\nabla (\\log \\rho_\\theta)(\\tau)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"html":"(logρθ)(τ)\\nabla (\\log 
\\rho_\\theta)(\\tau)(logρθ)(τ)","key":"twjtGQVJ7G"},{"type":"text","value":".)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"Fp77GCPVxu"}],"key":"w1gbuUxdbY"},{"type":"paragraph","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"Note that when the state transitions are Markov (i.e. ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"qD28y6T7a8"},{"type":"inlineMath","value":"s_{t}","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"sts_{t}st","key":"A5DE1l8QxP"},{"type":"text","value":" only depends on ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"QlNK9gqUTq"},{"type":"inlineMath","value":"s_{t-1}, a_{t-1}","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"st1,at1s_{t-1}, a_{t-1}st1,at1","key":"RqEK54X1wM"},{"type":"text","value":") and the policy is time-homogeneous (i.e. ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"poUG9T3HK0"},{"type":"inlineMath","value":"a_\\hi \\sim \\pi_\\theta (s_\\hi)","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"ahπθ(sh)a_\\hi \\sim \\pi_\\theta (s_\\hi)ahπθ(sh)","key":"L4SFZehBEz"},{"type":"text","value":"), we can write out the ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"q4TmvaqzzX"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"likelihood of a trajectory","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"BPnH6wskr2"}],"key":"pQhfSidQ6X"},{"type":"text","value":" under the policy ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"eg4Lvlbc5G"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"TsRXhMpGTW"},{"type":"text","value":":","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"MDe5uj5EYe"}],"key":"sMM4uECEWA"},{"type":"math","value":"\\begin{aligned}\n \\rho_\\theta(\\tau) &= \\mu(s_0) \\pi_\\theta(a_0 | s_0) \\\\\n &\\qquad \\times P(s_1 | s_0, a_0) \\pi_\\theta(a_1 | s_1) \\\\\n &\\qquad \\times \\cdots \\\\\n &\\qquad \\times P(s_{H-1} | s_{H-2}, a_{H-2}) \\pi_\\theta(a_{H-1} | s_{H-1}).\n\\end{aligned}","label":"trajectory_likelihood","identifier":"trajectory_likelihood","html":"ρθ(τ)=μ(s0)πθ(a0s0)×P(s1s0,a0)πθ(a1s1)××P(sH1sH2,aH2)πθ(aH1sH1).\\begin{aligned}\n \\rho_\\theta(\\tau) &= \\mu(s_0) \\pi_\\theta(a_0 | s_0) \\\\\n &\\qquad \\times P(s_1 | s_0, a_0) \\pi_\\theta(a_1 | s_1) \\\\\n &\\qquad \\times \\cdots \\\\\n &\\qquad \\times P(s_{H-1} | s_{H-2}, a_{H-2}) \\pi_\\theta(a_{H-1} | s_{H-1}).\n\\end{aligned}ρθ(τ)=μ(s0)πθ(a0s0)×P(s1s0,a0)πθ(a1s1)××P(sH1sH2,aH2)πθ(aH1sH1).","enumerator":"6.20","html_id":"trajectory-likelihood","key":"BI5R8Wlcti"},{"type":"paragraph","position":{"start":{"line":398,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"Note that the log-trajectory-likelihood turns into a sum of terms,\nof which only the ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"YJ9hcKUuQ1"},{"type":"inlineMath","value":"\\pi_\\theta(a_\\hi | s_\\hi)","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"πθ(ahsh)\\pi_\\theta(a_\\hi | 
s_\\hi)πθ(ahsh)","key":"VbVKIDRBwY"},{"type":"text","value":" terms depend on ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"wmB3sCpWl1"},{"type":"inlineMath","value":"\\theta,","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"θ,\\theta,θ,","key":"mZrjpSaEHs"},{"type":"text","value":"\nso we can simplify even further to obtain the following expression for the policy gradient, known as the “REINFORCE” policy gradient:","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"JldPluKaur"}],"key":"UGsCvN31Wt"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}","label":"reinforce_pg","identifier":"reinforce_pg","html":"J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)R(τ)]\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)R(τ)]","enumerator":"6.21","html_id":"reinforce-pg","key":"yJ0edqUUIC"},{"type":"paragraph","position":{"start":{"line":410,"column":1},"end":{"line":413,"column":1}},"children":[{"type":"text","value":"This expression allows us to estimate the gradient by sampling a few sample trajectories from ","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"JkZZ82IjVU"},{"type":"inlineMath","value":"\\pi_\\theta,","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"html":"πθ,\\pi_\\theta,πθ,","key":"xBRBpatfPm"},{"type":"text","value":"\ncalculating the likelihoods of the chosen actions,\nand substituting these into the expression above.\nWe can then use this gradient estimate to apply stochastic gradient ascent.","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"Xa8HKzlKKN"}],"key":"COZT3yBh3w"},{"type":"code","lang":"python","value":"def estimate_gradient_reinforce_pseudocode(env, π, θ):\n τ = sample_trajectory(env, π(θ))\n gradient_hat = 0\n for s, a, r in τ:\n def policy_log_likelihood(θ):\n return log(π(θ)(s, a))\n gradient_hat += jax.grad(policy_log_likelihood)(θ) * τ.total_reward\n return gradient_hat","position":{"start":{"line":415,"column":1},"end":{"line":424,"column":1}},"key":"AVh3JxVlYj"},{"type":"paragraph","position":{"start":{"line":426,"column":1},"end":{"line":429,"column":1}},"children":[{"type":"text","value":"In fact, we can perform one more simplification.\nIntuitively, the action taken at step ","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"key":"peyBfzb8j7"},{"type":"inlineMath","value":"t","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"html":"ttt","key":"jUFqfILoPV"},{"type":"text","value":" does not affect the reward from previous timesteps, since they’re already in the past!\nYou can also show rigorously that this is the case,\nand that we only need to consider the present and future rewards to calculate the policy gradient:","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"key":"t9Di0aq2pt"}],"key":"tgQBz8jlTf"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{t' = t}^{T-1} r(s_{t'}, a_{t'}) \\right] \\\\\n &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} 
\\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_{t}, a_{t}) \\right]\n\\end{aligned}","label":"pg_with_q","identifier":"pg_with_q","html":"J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)t=tT1r(st,at)]=Eτρθ[t=0T1θlogπθ(ahsh)Qπθ(st,at)]\\begin{aligned}\n \\nabla J(\\theta) &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{t' = t}^{T-1} r(s_{t'}, a_{t'}) \\right] \\\\\n &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_{t}, a_{t}) \\right]\n\\end{aligned}J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)t=tT1r(st,at)]=Eτρθ[t=0T1θlogπθ(ahsh)Qπθ(st,at)]","enumerator":"6.22","html_id":"pg-with-q","key":"BZOHzhtxoI"},{"type":"paragraph","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"strong","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"AaIMZclGtH"}],"key":"G1v227cgm2"},{"type":"text","value":" Prove that this is equivalent to the previous definitions. What modification to the expression must be made for the discounted, infinite-horizon setting?","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"Rvynz0H2Vd"}],"key":"q7MH92CVFN"},{"type":"paragraph","position":{"start":{"line":442,"column":1},"end":{"line":442,"column":1}},"children":[{"type":"text","value":"For some intuition into how this method works, recall that we update our parameters according to","position":{"start":{"line":442,"column":1},"end":{"line":442,"column":1}},"key":"aJsiLcE4yt"}],"key":"KwI4yilcLf"},{"type":"math","value":"\\begin{aligned}\n \\theta_{t+1} &= \\theta_\\hi + \\eta \\nabla J(\\theta_\\hi) \\\\\n &= \\theta_\\hi + \\eta \\E_{\\tau \\sim \\rho_{\\theta_\\hi}} [\\nabla \\log \\rho_{\\theta_\\hi}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}","position":{"start":{"line":444,"column":1},"end":{"line":449,"column":1}},"html":"θt+1=θh+ηJ(θh)=θh+ηEτρθh[logρθh(τ)R(τ)].\\begin{aligned}\n \\theta_{t+1} &= \\theta_\\hi + \\eta \\nabla J(\\theta_\\hi) \\\\\n &= \\theta_\\hi + \\eta \\E_{\\tau \\sim \\rho_{\\theta_\\hi}} [\\nabla \\log \\rho_{\\theta_\\hi}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}θt+1=θh+ηJ(θh)=θh+ηEτρθh[logρθh(τ)R(τ)].","enumerator":"6.23","key":"LxuHsOQFPQ"},{"type":"paragraph","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"children":[{"type":"text","value":"Consider the “good” trajectories where ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"li0rJDeb70"},{"type":"inlineMath","value":"R(\\tau)","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"R(τ)R(\\tau)R(τ)","key":"dZ3KQzWnhC"},{"type":"text","value":" is large. Then ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"ueUsldvvS5"},{"type":"text","value":"θ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"LsGniAkS3j"},{"type":"text","value":" gets updated so that these trajectories become more likely. 
For some intuition into how this method works, recall that we update our parameters according to

$$\begin{aligned}
    \theta_{k+1} &= \theta_k + \eta \nabla J(\theta_k) \\
    &= \theta_k + \eta \E_{\tau \sim \rho_{\theta_k}} [\nabla \log \rho_{\theta_k}(\tau) \cdot R(\tau)].
\end{aligned}$$

Consider the “good” trajectories where $R(\tau)$ is large. Then $\theta$ gets updated so that these trajectories become more likely. To see why, recall that $\rho_{\theta}(\tau)$ is the likelihood of the trajectory $\tau$ under the policy $\pi_\theta,$ so the gradient points in the direction that makes $\tau$ more likely.

## Baselines and advantages

A central idea from supervised learning is the **bias-variance decomposition**,
which shows that the mean squared error of an estimator is the sum of its squared bias and its variance.
The REINFORCE gradient estimator {eq}`reinforce_pg` is already *unbiased,* meaning that its expectation over trajectories is the true policy gradient.
Can we find ways to reduce its *variance* as well?

One common way is to subtract a **baseline function** $b_\hi : \mathcal{S} \to \mathbb{R}$ at each timestep $\hi.$ This modifies the policy gradient as follows:

$$\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[
    \sum_{\hi=0}^{H-1} \nabla \log \pi_\theta (a_\hi | s_\hi) \left(
        \left(
            \sum_{\hi' = \hi}^{H-1} r_{\hi'}
        \right)
        - b_\hi(s_\hi)
    \right)
    \right].$$ (eq:pg_baseline)

For example, we might want $b_\hi$ to estimate the average reward-to-go at a given timestep:

$$b_\hi^\theta = \E_{\tau \sim \rho_\theta} R_\hi(\tau).$$

This way, the random variable $R_\hi(\tau) - b_\hi^\theta$ is centered around zero, making certain algorithms more stable.

As a better baseline, we could instead choose the *value function.*
Note that the random variable $Q^\pi_\hi(s, a) - V^\pi_\hi(s),$
where the randomness is taken over the actions, is also centered around zero.
(Recall $V^\pi_\hi(s) = \E_{a \sim \pi} Q^\pi_\hi(s, a).$)
In fact, this quantity has a particular name: the **advantage function.**
This measures how much better this action does than the average for that policy.
(Note that for an optimal policy $\pi^\star,$ the advantage of a given state-action pair is always zero or negative.)

We can now express the policy gradient as follows. The advantage function effectively replaces the $Q$-function from {eq}`pg_with_q`:

$$\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[
    \sum_{\hi=0}^{\hor-1} \nabla \log \pi_\theta(a_\hi | s_\hi) A^{\pi_\theta}_\hi (s_\hi, a_\hi)
\right].$$ (pg_advantage)

To avoid correlations between the gradient estimator and the value estimator (i.e. the baseline), we must estimate them with independently sampled trajectories:

% TODO could use more explanation _why_ we want to avoid correlations

:::{prf:definition} Policy gradient with a learned baseline
:label: pg_baseline

```python
import jax
import jax.numpy as jnp

def pg_with_learned_baseline_pseudocode(env, π, η, θ_init, K, N):
    θ = θ_init
    for k in range(K):
        trajectories = sample_trajectories(env, π(θ), N)
        V_hat = fit(trajectories)  # estimates the value function of π(θ)
        τ = sample_trajectory(env, π(θ))  # a fresh, independent trajectory
        g = jnp.zeros_like(θ)  # gradient estimator

        for h, (s, a, r) in enumerate(τ):
            def log_likelihood(θ_):
                return jnp.log(π(θ_)(s, a))
            # ∇ log π_θ(a | s), weighted by the centered reward-to-go
            g = g + jax.grad(log_likelihood)(θ) * (return_to_go(τ, h) - V_hat(s))

        θ = θ + η * g
    return θ
```

Note that you could also generalize this by allowing the learning rate $\eta$ to vary across steps,
or take multiple trajectories $\tau$ and compute the sample average of the gradient estimates.

The baseline estimation step `fit` can be done using any appropriate supervised learning algorithm.
Note that the gradient estimator will be unbiased regardless of the baseline.
:::
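For concreteness, here is one possible sketch of the `fit` step (a hypothetical tabular implementation, assuming hashable states; any regression method would do). It averages the empirical returns-to-go observed from each state, pooled across timesteps; the time-indexed $b_\hi$ from the text would keep a separate average per timestep.

```python
import numpy as np

def fit(trajectories):
    # Hypothetical tabular baseline: average the empirical return-to-go
    # observed from each state, pooled across timesteps.
    totals, counts = {}, {}
    for τ in trajectories:
        rewards = np.array([r for _s, _a, r in τ])
        rtg = np.cumsum(rewards[::-1])[::-1]  # reward-to-go at each step
        for (s, _a, _r), g in zip(τ, rtg):
            totals[s] = totals.get(s, 0.0) + float(g)
            counts[s] = counts.get(s, 0) + 1
    return lambda s: totals[s] / counts[s] if s in totals else 0.0
```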
","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"Pwi3j35UkO"},{"type":"text","value":"τ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"yeeC2xjYpF"},{"type":"text","value":" and compute the sample average of the gradient estimates.","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"m5NO97Gp89"}],"key":"NgRltmVQQK"},{"type":"paragraph","position":{"start":{"line":528,"column":1},"end":{"line":529,"column":1}},"children":[{"type":"text","value":"The baseline estimation step ","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"Gin9Fl1Xfi"},{"type":"inlineCode","value":"fit","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"MWzEaCbZkA"},{"type":"text","value":" can be done using any appropriate supervised learning algorithm.\nNote that the gradient estimator will be unbiased regardless of the baseline.","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"csWrfgEDUM"}],"key":"t1ZHHBuP5X"}],"enumerator":"6.2","html_id":"pg-baseline","key":"deGg7gzOPy"}],"key":"z2qVjYLpZM"},{"type":"block","position":{"start":{"line":532,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"Comparing policy gradient algorithms to policy iteration","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"FymGW9hXBj"}],"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","label":"Comparing policy gradient algorithms to policy iteration","html_id":"comparing-policy-gradient-algorithms-to-policy-iteration","implicit":true,"enumerator":"6.6","key":"X7tnvZTFk6"},{"type":"comment","value":" TODO maybe restructure this part ","key":"XPtmG5iGnG"},{"type":"paragraph","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"What advantages does the policy gradient algorithm have over ","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"A7G4VBz9PA"},{"type":"crossReference","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Section ","key":"mSMgPNqRCs"},{"type":"text","value":"1.5.3.2","key":"n3aQchWNrV"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"bSkLIoAp4m"},{"type":"text","value":"?","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"IadHcSEkB0"}],"key":"fUWM9U06Ew"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy iteration recap","position":{"start":{"line":540,"column":1},"end":{"line":540,"column":1}},"key":"PeZSpZjPow"}],"key":"Cja3vle470"},{"type":"paragraph","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"children":[{"type":"text","value":"Recall that policy iteration is an algorithm for MDPs with unknown state transitions where we alternate between these two 
steps:","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"eUqHcIIVfw"}],"key":"ZKS81rz5zc"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":543,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"Estimating the ","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"diwU9VfaeK"},{"type":"inlineMath","value":"Q","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"html":"QQQ","key":"agOWFMjFF5"},{"type":"text","value":"-function (or advantage function) of the current policy;","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"LsFXK7q3ZG"}],"key":"ojK1KRLfSM"},{"type":"listItem","spread":true,"position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"text","value":"Updating the policy to be greedy w.r.t. this approximate ","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"zfWX3jWKh4"},{"type":"inlineMath","value":"Q","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"html":"QQQ","key":"E6Vl7MCxd3"},{"type":"text","value":"-function (or advantage function).","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"W8NeOxOIXc"}],"key":"UcWR0ZG4Gg"}],"key":"eb40bTtAcZ"}],"key":"tI7irWcIvR"},{"type":"paragraph","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"To analyze the difference between them, we’ll make use of the ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"cFSfNtN985"},{"type":"strong","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"performance difference lemma","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"EkjLRr1Azl"}],"key":"kL8kYYoYyh"},{"type":"text","value":", which provides an expression for comparing the difference between two value functions.","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"ngoj75en1Y"}],"key":"ym8nfed8L7"},{"type":"proof","kind":"theorem","label":"pdl","identifier":"pdl","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance difference lemma","position":{"start":{"line":549,"column":1},"end":{"line":549,"column":1}},"key":"fyLev50ZW4"}],"key":"R7kaJhXeVv"},{"type":"paragraph","position":{"start":{"line":552,"column":1},"end":{"line":555,"column":1}},"children":[{"type":"text","value":"Suppose Alice is playing a game (an MDP).\nBob is spectating, and can evaluate how good an action is compared to his own strategy.\n(That is, Bob can compute his ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"YAiBQqUyBc"},{"type":"emphasis","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"children":[{"type":"text","value":"advantage function","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"yIB0ba207K"}],"key":"ZPL8i2eQLa"},{"type":"text","value":" ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"UBySJM3Ifw"},{"type":"inlineMath","value":"A_\\hi^{\\text{Bob}}(s_\\hi, 
a_\\hi)","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"html":"AhBob(sh,ah)A_\\hi^{\\text{Bob}}(s_\\hi, a_\\hi)AhBob(sh,ah)","key":"sv9KKYyPnG"},{"type":"text","value":").\nThe performance difference lemma says that Bob can now calculate exactly how much better or worse he is than Alice as follows:","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"nNLoNuIUAF"}],"key":"uWbho1Knf9"},{"type":"math","value":"V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]","label":"pdl_eq","identifier":"pdl_eq","html":"V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]","enumerator":"6.27","html_id":"pdl-eq","key":"sMw1fWQ3JN"},{"type":"paragraph","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"QBq711RaHm"},{"type":"inlineMath","value":"\\rho_{\\text{Alice}, s}","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"html":"ρAlice,s\\rho_{\\text{Alice}, s}ρAlice,s","key":"Zu7V22Ystn"},{"type":"text","value":" denotes the distribution over trajectories starting in state ","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"RfgKLNntvR"},{"type":"inlineMath","value":"s","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"html":"sss","key":"AmBKnXc3pv"},{"type":"text","value":" when Alice is playing.","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"Sub9u9En5X"}],"key":"fsF4MHj9Tk"},{"type":"paragraph","position":{"start":{"line":564,"column":1},"end":{"line":566,"column":1}},"children":[{"type":"text","value":"To see why, consider just a single step ","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"key":"A4xpztZ0SA"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"html":"h\\hih","key":"FDtAGnPCWf"},{"type":"text","value":" of the trajectory.\nAt this step we compute how much better actions from Bob are than the actions from Alice, on average.\nBut this is exactly the average Bob-advantage across actions from Alice, as described in the PDL!","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"key":"xPLinB6EhA"}],"key":"f2gObilCqV"},{"type":"paragraph","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"children":[{"type":"text","value":"Formally, this corresponds to a nice telescoping simplification when we expand out the definition of the advantage function. 
Note that","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"key":"F1AW6F3EHZ"}],"key":"YXFcosjUQ1"},{"type":"math","value":"\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}","position":{"start":{"line":570,"column":1},"end":{"line":575,"column":1}},"html":"Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)","enumerator":"6.28","key":"cyHnGwSts3"},{"type":"paragraph","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"children":[{"type":"text","value":"so expanding out the r.h.s. expression of ","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"key":"ffwCcMSZZT"},{"type":"crossReference","kind":"equation","identifier":"pdl_eq","label":"pdl_eq","children":[{"type":"text","value":"(","key":"OjsAsOJOp5"},{"type":"text","value":"6.27","key":"C4xaEFRlrk"},{"type":"text","value":")","key":"EcevKJEVxl"}],"template":"(%s)","enumerator":"6.27","resolved":true,"html_id":"pdl-eq","key":"XgCsUAP0aP"},{"type":"text","value":" and grouping terms together gives","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"key":"PAzFa8Urzt"}],"key":"OaxY3nS3LW"},{"type":"math","value":"\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}","position":{"start":{"line":579,"column":1},"end":{"line":584,"column":1}},"html":"EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)","enumerator":"6.29","key":"pdJglG1CnW"},{"type":"paragraph","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"as desired. 
(Note that the “inner” expectation from expanding the advantage function has the same distribution as the outer one, so omitting it here is valid.)","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"key":"uY7Q46UwwQ"}],"key":"zDlbgNZ59o"}],"enumerator":"6.1","html_id":"pdl","key":"waMn7Y6ZLf"},{"type":"paragraph","position":{"start":{"line":589,"column":1},"end":{"line":594,"column":1}},"children":[{"type":"text","value":"The PDL gives insight into why fitted approaches such as PI don’t work as well in the “full” RL setting.\nTo see why, let’s consider a single iteration of policy iteration, where policy ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"lopMFOebYI"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"G6ZJxtGZ66"},{"type":"text","value":" gets updated to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"lrwgNqMxJv"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"OGAT5p5lpf"},{"type":"text","value":". We’ll assume these policies are deterministic.\nSuppose the new policy ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"tdHETDeOh9"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"pMJLw2VGp1"},{"type":"text","value":" chooses some action with a negative advantage with respect to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"amyoL1xsOs"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Fto6jQLzQs"},{"type":"text","value":".\nThat is, when acting according to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"aFBYF8kP86"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"EZRjPX6NY9"},{"type":"text","value":", taking the action from ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"ewYbM33HRK"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"HqqRENGaOo"},{"type":"text","value":" would perform worse than expected.\nDefine ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"cJwtoUxDVj"},{"type":"inlineMath","value":"\\Delta_\\infty","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"Δ\\Delta_\\inftyΔ","key":"TBqvY8AglN"},{"type":"text","value":" to be the most negative advantage, that is, ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"vlKKjfkLbm"},{"type":"inlineMath","value":"\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"Δ=minsSAhπ(s,π~(s))\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))Δ=minsSAhπ(s,π~(s))","key":"YnINRbFOnk"},{"type":"text","value":".\nPlugging this into the ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"hQxjn9KCGK"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem 
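As a sanity check, here is a minimal numeric sketch (the random MDP, the starting state, and all names are illustrative assumptions, not from the text) that verifies the lemma exactly on a small finite-horizon MDP via backward induction:

```python
import numpy as np

rng = np.random.default_rng(0)
S, A, H = 3, 2, 4
P = rng.dirichlet(np.ones(S), size=(S, A))  # P[s, a] = distribution over s'
r = rng.random((S, A))                      # rewards r(s, a), same at every h
alice = rng.integers(A, size=S)             # deterministic policies: state -> action
bob = rng.integers(A, size=S)

def backward_induction(pi):
    Q = np.zeros((H, S, A))
    V = np.zeros((H + 1, S))  # V[H] = 0 at the end of the horizon
    for h in reversed(range(H)):
        Q[h] = r + P @ V[h + 1]
        V[h] = Q[h][np.arange(S), pi]
    return Q, V

Q_bob, V_bob = backward_induction(bob)
_, V_alice = backward_induction(alice)

# State occupancy d[h, s] under Alice, starting from state 0
d = np.zeros((H, S))
d[0, 0] = 1.0
for h in range(H - 1):
    d[h + 1] = d[h] @ P[np.arange(S), alice]

# Bob's advantage evaluated at Alice's actions, summed along Alice's trajectories
A_bob = Q_bob[:, np.arange(S), alice] - V_bob[:H]
lhs = V_alice[0, 0] - V_bob[0, 0]
rhs = (d * A_bob).sum()
print(np.isclose(lhs, rhs))  # True: the two sides of the PDL agree
```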
","key":"Tcdn2cByqu"},{"type":"text","value":"6.1","key":"JVqaX08RVv"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","key":"orIVAIrVAA"},{"type":"text","value":" gives","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Rl0nojDZWt"}],"key":"THSB3zlXeH"},{"type":"math","value":"\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}","position":{"start":{"line":596,"column":1},"end":{"line":604,"column":1}},"html":"V0π~(s)V0π(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π~(s)V0π(s)HΔ.\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}V0π~(s)V0π(s)V0π~(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π(s)HΔ∣.","enumerator":"6.30","key":"H9CBSF3MGO"},{"type":"paragraph","position":{"start":{"line":606,"column":1},"end":{"line":612,"column":1}},"children":[{"type":"text","value":"That is, for some state ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"W7L5nmJMoa"},{"type":"inlineMath","value":"s","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"sss","key":"vEj0RDLyLY"},{"type":"text","value":", the lower bound on the performance of ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"sBlIBLgg0q"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"π~\\tilde \\piπ~","key":"LZrge61eNU"},{"type":"text","value":" is ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"Mw4uEXaudG"},{"type":"emphasis","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"lower","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"ed113dJjy1"}],"key":"vbfutvAIvb"},{"type":"text","value":" than the performance of ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"KJoxmMKAWJ"},{"type":"text","value":"π","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"M40wGOOm4H"},{"type":"text","value":".\nThis doesn’t state that ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"vhTNrxZwwR"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"π~\\tilde \\piπ~","key":"ikNRkuzkDq"},{"type":"text","value":" ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"bGZxvLa92x"},{"type":"emphasis","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"will","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"DW9AstUyai"}],"key":"qvOQUSyRRE"},{"type":"text","value":" necessarily perform worse than ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"Cp0L1qQPoE"},{"type":"text","value":"π","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"zfTuer4ufo"},{"type":"text","value":",\nonly suggests that it might be 
possible.\nIf these worst case states do exist, though,\nPI does not avoid situations where the new policy often visits them;\nIt does not enforce that the trajectory distributions ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"PbuU78AAdw"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"HjQ2udzf6G"},{"type":"text","value":" and ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"uQKg25oS8u"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"GtT6TsYAWu"},{"type":"text","value":" be close to each other.\nIn other words, the “training distribution” that our prediction rule is fitted on, ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"IquPEwbIOi"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"AjTdHoGxmr"},{"type":"text","value":", may differ significantly from the “evaluation distribution” ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"ewhuHB7N2u"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"nTn9FHy5vC"},{"type":"text","value":".","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"pKCTMz5coG"}],"key":"o588SbzEQL"},{"type":"comment","value":" \nThis is an instance of *distributional shift*.\nTo begin, let's ask, where *do* fitted approaches work well?\nThey are commonly seen in SL,\nwhere a prediction rule is fit using some labelled training set,\nand then assessed on a test set from the same distribution.\nBut policy iteration isn't performed in the same scenario:\nthere is now _distributional shift_ between the different iterations of the policy. 
","key":"MdUpOEEbsv"},{"type":"paragraph","position":{"start":{"line":623,"column":1},"end":{"line":629,"column":1}},"children":[{"type":"text","value":"On the other hand, policy gradient methods ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"qJpiXsxJ0u"},{"type":"emphasis","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"do","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"x43y2HAGng"}],"key":"uxyNfR1tMf"},{"type":"text","value":", albeit implicitly,\nencourage ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"GRPjz8PywH"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"alKqfTWFkd"},{"type":"text","value":" and ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"ioKHTnYcl3"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"l5XoRY1fmw"},{"type":"text","value":" to be similar.\nSuppose that the mapping from policy parameters to trajectory distributions is relatively smooth.\nThen, by adjusting the parameters only a small distance,\nthe new policy will also have a similar trajectory distribution.\nBut this is not very rigorous, and in practice the parameter-to-distribution mapping may not be so smooth.\nCan we constrain the distance between the resulting distributions more ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"BSuGF4qwBA"},{"type":"emphasis","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"sYfEkGUkyA"}],"key":"T1BZyEhaaD"},{"type":"text","value":"?","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"MSyWFqoYYU"}],"key":"ADYyY5ly7g"},{"type":"paragraph","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"children":[{"type":"text","value":"This brings us to the next three methods:","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"key":"uRPjb9uNX3"}],"key":"HTMEsKYXJg"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":632,"column":1},"end":{"line":635,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"strong","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"text","value":"trust region policy optimization","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"key":"XyYnD11EaZ"}],"key":"tusaKicIWb"},{"type":"text","value":" (TRPO), which explicitly constrains the difference between the distributions before and after each step;","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"key":"K3LvdUhvb8"}],"key":"tjDGC1t5St"},{"type":"listItem","spread":true,"position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"the 
","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"EYwNThXn18"},{"type":"strong","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"sIADJhigbp"}],"key":"UWEEsBztDC"},{"type":"text","value":" (NPG), a first-order approximation of TRPO;","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"DkSAoGAlio"}],"key":"oa8bgm49JN"},{"type":"listItem","spread":true,"position":{"start":{"line":634,"column":1},"end":{"line":635,"column":1}},"children":[{"type":"strong","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"children":[{"type":"text","value":"proximal policy optimization","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"oloBtCxud8"}],"key":"BB6OqUmL6g"},{"type":"text","value":" (PPO), a “soft relaxation” of TRPO.","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"JCIWtOHDi8"}],"key":"uhNCVKdbHV"}],"key":"MTOB7Wb9t1"}],"key":"DJljn9qRoD"},{"type":"block","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":638,"column":1},"end":{"line":638,"column":1}},"children":[{"type":"text","value":"Trust region policy optimization","position":{"start":{"line":638,"column":1},"end":{"line":638,"column":1}},"key":"a5shkyFS4B"}],"identifier":"trust-region-policy-optimization","label":"Trust region policy optimization","html_id":"trust-region-policy-optimization","implicit":true,"enumerator":"6.7","key":"B4TsOsO8JI"},{"type":"paragraph","position":{"start":{"line":640,"column":1},"end":{"line":644,"column":1}},"children":[{"type":"text","value":"We saw above that policy gradient methods are effective because they implicitly constrain how much the policy changes at each iteration.\nCan we design an algorithm that ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"LQLt2oHt9H"},{"type":"emphasis","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"llZ5m44PTy"}],"key":"SzvKB6ycIo"},{"type":"text","value":" constrains the “step size”?\nThat is, we want to ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"GMzRhgNRaT"},{"type":"emphasis","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"improve","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"ESJ7SILqUr"}],"key":"r5e4h52EG0"},{"type":"text","value":" the policy as much as possible,\nmeasured in terms of the r.h.s. 
of the ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"UugPTtmNdn"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem ","key":"dePq2wF7J5"},{"type":"text","value":"6.1","key":"xHdLEFF47P"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","key":"TNx1VWf00L"},{"type":"text","value":",\nwhile ensuring that its trajectory distribution does not change too much:","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"UvYgR1vBOg"}],"key":"tCr5FbGS2o"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}","position":{"start":{"line":646,"column":1},"end":{"line":651,"column":1}},"html":"θk+1argmaxθoptEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}θk+1argθoptmaxEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ","enumerator":"6.31","key":"Sv1Ndc3gXW"},{"type":"paragraph","position":{"start":{"line":653,"column":1},"end":{"line":659,"column":1}},"children":[{"type":"text","value":"Note that we have made a small change to the r.h.s. 
expression:\nwe use the ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"uYvXOPhYyj"},{"type":"emphasis","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"states","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"aX5r6zIWVn"}],"key":"AaK28hdGYL"},{"type":"text","value":" sampled from the old policy, and only use the ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"tDU8xgiXdb"},{"type":"emphasis","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"BrGVhoNuwx"}],"key":"xBG7zDoxNa"},{"type":"text","value":" from the new policy.\nIt would be computationally infeasible to sample entire trajectories from ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"XNNRYj6IYk"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"MGlkx5UXFR"},{"type":"text","value":" as we are optimizing over ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"eedO4zmUFR"},{"type":"text","value":"θ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"R68NSRA0yH"},{"type":"text","value":".\nOn the other hand, if ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"V8IxqHGhRB"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"SszfxK8ClU"},{"type":"text","value":" returns a vector representing a probability distribution over actions,\nthen evaluating the expected advantage with respect to this distribution only requires taking a dot product.\nThis approximation also matches the r.h.s. 
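As a tiny illustration of that last point, with a finite action space the inner expectation over $a_\hi \sim \pi^{\theta^\text{opt}}(s_\hi)$ is literally a dot product between the policy's probability vector and the estimated advantages (the numbers below are made up):

```python
import jax.numpy as jnp

π_probs = jnp.array([0.2, 0.5, 0.3])   # π_θopt(· | s), hypothetical values
A_hat_s = jnp.array([-1.0, 0.4, 0.1])  # estimated advantages A^{π_k}(s, ·)
expected_advantage = jnp.dot(π_probs, A_hat_s)
print(expected_advantage)  # 0.2*(-1.0) + 0.5*0.4 + 0.3*0.1 = 0.03
```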
How do we describe the distance between $\rho_{\theta^{\text{opt}}}$ and $\rho_{\theta^k}$?
We’ll use the **Kullback-Leibler divergence (KLD)**:

:::{prf:definition} Kullback-Leibler divergence
:label: kld

For two PDFs $p, q$,

$$\kl{p}{q} := \E_{x \sim p} \left[ \log \frac{p(x)}{q(x)} \right]$$

This can be interpreted in many different ways, many stemming from information theory.
One such interpretation is that $\kl{p}{q}$ describes my average “surprise” if I *think* data is being generated by $q$ but it’s actually generated by $p$.
(The **surprise** of an event with probability $p$ is $- \log_2 p$.)
Note that $\kl{p}{q} = 0$ if and only if $p = q$.
Also note that it is generally *not* symmetric.
:::
Also note that it is generally ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"s8DWnqwsES"},{"type":"emphasis","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"Bde2SnOFYC"}],"key":"rKqr2UrJHU"},{"type":"text","value":" symmetric.","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"zHqyYefOyV"}],"key":"WfWBpLFSNh"}],"enumerator":"6.3","html_id":"kld","key":"BuYSm2vNOu"},{"type":"paragraph","position":{"start":{"line":677,"column":1},"end":{"line":680,"column":1}},"children":[{"type":"text","value":"Both the objective function and the KLD constraint involve a weighted average over the space of all trajectories.\nThis is intractable in general, so we need to estimate the expectation.\nAs before, we can do this by taking an empirical average over samples from the trajectory distribution.\nThis gives us the following pseudocode:","position":{"start":{"line":677,"column":1},"end":{"line":677,"column":1}},"key":"OjBQdEbZ9D"}],"key":"ARFxaSr3wJ"},{"type":"proof","kind":"definition","label":"trpo","identifier":"trpo","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trust region policy optimization (exact)","position":{"start":{"line":682,"column":1},"end":{"line":682,"column":1}},"key":"cdjVDeq79X"}],"key":"hP31xF1ue7"},{"type":"code","lang":"python","value":"def trpo_pseudocode(env, δ, θ_init, M, K):\n    θ = θ_init\n    for k in range(K):\n        trajectories = sample_trajectories(env, π(θ), M)\n        A_hat = fit(trajectories)\n        \n        def approximate_gain(θ_):\n            # evaluate the candidate policy π(θ_) on states sampled from π(θ)\n            total_advantage = 0\n            for τ in trajectories:\n                for s, _a, _r in τ:\n                    for a in env.action_space:\n                        total_advantage += π(θ_)(s, a) * A_hat(s, a)\n            return total_advantage / M\n        \n        def constraint(θ_):\n            # empirical average of the trajectory-level KL divergence\n            kl_div = 0\n            for τ in trajectories:\n                for s, a, _r in τ:\n                    kl_div += jnp.log(π(θ)(s, a)) - jnp.log(π(θ_)(s, a))\n            return kl_div / M <= δ\n        \n        θ = optimize(approximate_gain, constraint)\n\n    return θ","position":{"start":{"line":686,"column":1},"end":{"line":711,"column":1}},"key":"mKuFSFUCAm"}],"enumerator":"6.4","html_id":"trpo","key":"UbYVxmyN9a"},{"type":"comment","value":"\nApplying importance sampling allows us to estimate the TRPO objective as follows:\n\n::::{prf:definition} Trust region policy optimization (implementation)\n:label: trpo_implement\n\n:::{prf:definitionic} TODO\nInitialize $\\theta^0$\n\nSample $N$ trajectories from $\\rho^k$ to learn a value estimator $\\tilde b_\\hi(s) \\approx V^{\\pi^k}_\\hi(s)$\n\nSample $M$ trajectories $\\tau_0, \\dots, \\tau_{M-1} \\sim \\rho^k$\n\n$$\\begin{gathered}\n    \\theta^{k+1} \\gets \\arg\\max_{\\theta} \\frac{1}{M} \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} [ R_\\hi(\\tau_m) - \\tilde b_\\hi(s_\\hi) ] \\\\\n    \\text{where } \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\log \\frac{\\pi_k(a_\\hi^m \\mid s_\\hi^m)}{\\pi_\\theta(a_\\hi^m \\mid s_\\hi^m)} \\le \\delta\n    \n\\end{gathered}$$\n:::\n:::: ","key":"h3Iz8AXiND"},{"type":"paragraph","position":{"start":{"line":735,"column":1},"end":{"line":742,"column":1}},"children":[{"type":"text","value":"The above isn’t entirely complete:\nwe still need to solve the actual optimization problem at each step.\nUnless we know additional properties of the problem,\nthis might be an intractable optimization.\nDo we need to solve it exactly, though?\nInstead, if we assume that 
both the objective function and the constraint are somewhat smooth in terms of the policy parameters,\nwe can use their ","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"fyIo5IRRKn"},{"type":"emphasis","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"children":[{"type":"text","value":"Taylor expansions","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"f8kKd8voR1"}],"key":"mY4Qi3YWgS"},{"type":"text","value":" to give us a simpler optimization problem with a closed-form solution.\nThis brings us to the ","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"nlCrfMa5LW"},{"type":"strong","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"sR2FeLkIAD"}],"key":"XfcGtyOgEd"},{"type":"text","value":" algorithm.","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"NjD1Hdr95R"}],"key":"RPDsqjG93z"}],"key":"mYsJTFsOjF"},{"type":"block","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":746,"column":1},"end":{"line":746,"column":1}},"children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":746,"column":1},"end":{"line":746,"column":1}},"key":"uB6M0AqUVb"}],"identifier":"natural-policy-gradient","label":"Natural policy gradient","html_id":"natural-policy-gradient","implicit":true,"enumerator":"6.8","key":"zMW9aBGrGe"},{"type":"paragraph","position":{"start":{"line":748,"column":1},"end":{"line":749,"column":1}},"children":[{"type":"text","value":"We take a ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"w03kLnORrp"},{"type":"emphasis","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"cg4Ys31M1z"}],"key":"W9rDGuQeXK"},{"type":"text","value":" (first-order) approximation to the objective function and a ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"qvjDtOZ3zJ"},{"type":"emphasis","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"children":[{"type":"text","value":"quadratic","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"SfU70Dr22c"}],"key":"BE2YjVBLEj"},{"type":"text","value":" (second-order) approximation to the KL divergence constraint about the current estimate ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"L8XYqH5UQu"},{"type":"inlineMath","value":"\\theta^k","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"html":"θk\\theta^kθk","key":"joFQzxAsZS"},{"type":"text","value":".\nThis results in the optimization problem","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"NTv5fFWgFI"}],"key":"UAW9HYsWvu"},{"type":"math","value":"\\begin{gathered}\n \\max_\\theta \\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}","label":"npg_optimization","identifier":"npg_optimization","html":"maxθθJ(πθk)(θθk)where 12(θθk)Fθk(θθk)δ\\begin{gathered}\n \\max_\\theta 
\\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}θmaxθJ(πθk)(θθk)where 21(θθk)Fθk(θθk)δ","enumerator":"6.33","html_id":"npg-optimization","key":"pjzAj9F3oc"},{"type":"paragraph","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"TwFAidMQAB"},{"type":"inlineMath","value":"F_{\\theta^k}","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"html":"FθkF_{\\theta^k}Fθk","key":"cuKH30Kg8q"},{"type":"text","value":" is the ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"KBhs3Hm8je"},{"type":"strong","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"sEaCMwksfP"}],"key":"iOMrH1c72P"},{"type":"text","value":" defined below.","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"fAs2SRoonD"}],"key":"R0FtXP1CeI"},{"type":"proof","kind":"definition","label":"fisher_matrix","identifier":"fisher_matrix","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"oSBDZpYGIX"}],"key":"Mtv96d5PAY"},{"type":"paragraph","position":{"start":{"line":765,"column":1},"end":{"line":766,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"a9ddMUPEfB"},{"type":"inlineMath","value":"p_\\theta","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"pθp_\\thetapθ","key":"hTeV5wPOb7"},{"type":"text","value":" denote a parameterized distribution.\nIts Fisher information matrix ","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"Ry16Mdmu2p"},{"type":"inlineMath","value":"F_\\theta","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"FθF_\\thetaFθ","key":"zj24XhlyQ5"},{"type":"text","value":" can be defined equivalently as:","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"cjNZrsECna"}],"key":"qFNHjeMxAQ"},{"type":"math","value":"\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}","position":{"start":{"line":768,"column":1},"end":{"line":773,"column":1}},"html":"Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]covariance matrix of the Fisher score=Expθ[θ2logpθ(x)]average Hessian of the negative log-likelihood\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]=Expθ[θ2logpθ(x)]covariance matrix of the Fisher scoreaverage Hessian of the 
negative log-likelihood","enumerator":"6.34","key":"cPCba7SkA2"},{"type":"paragraph","position":{"start":{"line":775,"column":1},"end":{"line":778,"column":1}},"children":[{"type":"text","value":"Recall that the Hessian of a function describes its curvature:\nfor a vector ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"IDFXZOEUXZ"},{"type":"inlineMath","value":"\\delta \\in \\Theta","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"html":"δΘ\\delta \\in \\ThetaδΘ","key":"fGJagpoQRa"},{"type":"text","value":",\nthe quantity ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"tLViLk4H92"},{"type":"inlineMath","value":"\\delta^\\top F_\\theta \\delta","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"html":"δFθδ\\delta^\\top F_\\theta \\deltaδFθδ","key":"inkyF58aRR"},{"type":"text","value":" describes how rapidly the negative log-likelihood changes if we move by ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"ENItOTql1V"},{"type":"text","value":"δ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"YoBcyJ58Sy"},{"type":"text","value":".\nThe Fisher information matrix is precisely the Hessian of the KL divergence (with respect to either one of the parameters).","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"gFUZJWVEij"}],"key":"f51fk5eTW9"},{"type":"paragraph","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"children":[{"type":"text","value":"In particular, when ","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"key":"NLdQDnXmKv"},{"type":"inlineMath","value":"p_\\theta = \\rho_{\\theta}","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"html":"pθ=ρθp_\\theta = \\rho_{\\theta}pθ=ρθ","key":"aTbJ9mnw3P"},{"type":"text","value":" denotes a trajectory distribution, we can further simplify the expression:","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"key":"pGPswynlIx"}],"key":"dWIlsu9WYh"},{"type":"math","value":"F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]","label":"fisher_trajectory","identifier":"fisher_trajectory","html":"Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]","enumerator":"6.35","html_id":"fisher-trajectory","key":"abNjynSJoE"},{"type":"paragraph","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"children":[{"type":"text","value":"Note that we’ve used the Markov property to cancel out the cross terms corresponding to two different time steps.","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"Y9stPfvbEP"}],"key":"qDHx5ASprL"}],"enumerator":"6.5","html_id":"fisher-matrix","key":"JhUywnxCPx"},{"type":"paragraph","position":{"start":{"line":791,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"text","value":"This is a convex optimization problem with a closed-form solution.\nTo see why, it helps to visualize the case where 
","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"htocAb42VJ"},{"type":"text","value":"θ","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"vU5D5SKhsV"},{"type":"text","value":" is two-dimensional:\nthe constraint describes the inside of an ellipse,\nand the objective function is linear,\nso we can find the extreme point on the boundary of the ellipse.\nWe recommend ","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"aYqtMQe4bb"},{"type":"cite","kind":"narrative","label":"boyd_convex_2004","identifier":"boyd_convex_2004","children":[{"type":"text","value":"Boyd & Vandenberghe (2004)","key":"Qm4I7XLIUS"}],"enumerator":"1","key":"zkqp7HbtkI"},{"type":"text","value":" for a comprehensive treatment of convex optimization.","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"LJVaBabR6C"}],"key":"mnipl0BKWf"},{"type":"paragraph","position":{"start":{"line":798,"column":1},"end":{"line":799,"column":1}},"children":[{"type":"text","value":"More generally, for a higher-dimensional ","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"zxL8G7VgW1"},{"type":"text","value":"θ","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"Q44nz39578"},{"type":"text","value":",\nwe can compute the global optima by setting the gradient of the Lagrangian to zero:","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"Sabmo2Huo5"}],"key":"fhOXP5Dhy1"},{"type":"math","value":"\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}","position":{"start":{"line":801,"column":1},"end":{"line":809,"column":1}},"html":"L(θ,α)=J(πθk)(θθk)α[12(θθk)Fθk(θθk)δ]L(θk+1,α):=0    J(πθk)=αFθk(θk+1θk)θk+1=θk+ηFθk1J(πθk)where η=2δJ(πθk)Fθk1J(πθk)\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}L(θ,α)L(θk+1,α)J(πθk)θk+1where η=J(πθk)(θθk)α[21(θθk)Fθk(θθk)δ]:=0=αFθk(θk+1θk)=θk+ηFθk1J(πθk)=J(πθk)Fθk1J(πθk)2δ","enumerator":"6.36","key":"jRJUeDHnSC"},{"type":"paragraph","position":{"start":{"line":811,"column":1},"end":{"line":813,"column":1}},"children":[{"type":"text","value":"This gives us the closed-form update.\nNow the only challenge is to estimate the Fisher information matrix,\nsince, as with the KL divergence constraint, it is an expectation over trajectories, and computing it exactly is 
therefore typically intractable.","position":{"start":{"line":811,"column":1},"end":{"line":811,"column":1}},"key":"LlVjF5n4GB"}],"key":"SO9mQbpBT5"},{"type":"proof","kind":"definition","label":"npg","identifier":"npg","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"cplpYZdZEE"}],"key":"Og3dwfRGT8"},{"type":"paragraph","position":{"start":{"line":818,"column":1},"end":{"line":820,"column":1}},"children":[{"type":"text","value":"How many trajectory samples do we need to accurately estimate the Fisher information matrix?\nAs a rule of thumb, the sample complexity should scale with the dimension of the parameter space.\nThis makes the approach intractable in the deep learning setting, where we might have a very large number of parameters.","position":{"start":{"line":818,"column":1},"end":{"line":818,"column":1}},"key":"Z1NCEjQMrN"}],"key":"p2HFGHE0sJ"}],"enumerator":"6.6","html_id":"npg","key":"Zg2w5tLq0q"},{"type":"paragraph","position":{"start":{"line":823,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"As you can see, the NPG is the “basic” policy gradient algorithm we saw above,\nbut with the gradient transformed by the inverse Fisher information matrix.\nThis matrix can be understood as accounting for the ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"XQUJPndh4P"},{"type":"strong","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"geometry of the parameter space.","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"jU3eiZ37Bz"}],"key":"MwzsevSoVQ"},{"type":"text","value":"\nThe typical gradient descent algorithm implicitly measures distances between parameters using the standard ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"bbUa0mu12I"},{"type":"emphasis","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"Euclidean distance","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"fiuR7OIAzT"}],"key":"E2i0FFmepI"},{"type":"text","value":".\nHere, where the parameters map to a ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"jNCGeJbEDg"},{"type":"emphasis","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"distribution","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"Xlr1aAhRHM"}],"key":"SRHEsWxHJ7"},{"type":"text","value":", using the natural gradient update is equivalent to optimizing over ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"TWR46lcmvD"},{"type":"strong","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"distribution space","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"wu7Zl5DTmc"}],"key":"padNyTeBLP"},{"type":"text","value":" rather than parameter space,\nwhere the distance between distributions is measured by the KL divergence of ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"VeTPzEIjK0"},{"type":"crossReference","kind":"proof:definition","identifier":"kld","label":"kld","children":[{"type":"text","value":"Definition 
","key":"HAaCkJNioK"},{"type":"text","value":"6.3","key":"CdmqMG4eLm"}],"template":"Definition %s","enumerator":"6.3","resolved":true,"html_id":"kld","key":"UvNDjFdKUe"},{"type":"text","value":".","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"sMGDbQG91r"}],"key":"T2GaK5TaXC"},{"type":"proof","kind":"example","label":"natural_simple","identifier":"natural_simple","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural gradient on a simple problem","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"mLQNtFZTKO"}],"key":"QUSrESjfAH"},{"type":"paragraph","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"children":[{"type":"text","value":"Let’s step away from RL and consider the following optimization problem over Bernoulli distributions ","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"key":"F394TFERoF"},{"type":"inlineMath","value":"\\pi \\in \\Delta(\\{ 0, 1 \\})","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"html":"πΔ({0,1})\\pi \\in \\Delta(\\{ 0, 1 \\})πΔ({0,1})","key":"sjDZsmjmus"},{"type":"text","value":":","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"key":"K1qWbTbpah"}],"key":"Xck3Qs3xQS"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}","position":{"start":{"line":835,"column":1},"end":{"line":839,"column":1}},"html":"J(π)=100π(1)+1π(0)\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}J(π)=100π(1)+1π(0)","enumerator":"6.37","key":"aSsLIaMwu9"},{"type":"paragraph","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"children":[{"type":"text","value":"We can think of the space of such distributions as the line between ","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"apfPcz4HIo"},{"type":"inlineMath","value":"(0, 1)","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"html":"(0,1)(0, 1)(0,1)","key":"MYOMVluSr8"},{"type":"text","value":" to ","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"Lcki2IbIke"},{"type":"inlineMath","value":"(1, 0)","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"html":"(1,0)(1, 0)(1,0)","key":"PCURPHRys2"},{"type":"text","value":" on the Cartesian plane:","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"htX4t1KvIv"}],"key":"VBCEvz7uQw"},{"type":"image","url":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.png","alt":"a line from (0, 1) to (1, 0)","width":"240px","align":"center","key":"rOfEYdJiCq","urlSource":"shared/npg_line.png","urlOptimized":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.webp"},{"type":"paragraph","position":{"start":{"line":849,"column":1},"end":{"line":851,"column":1}},"children":[{"type":"text","value":"Clearly the optimal distribution is the constant one ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"HSaPMfJebR"},{"type":"inlineMath","value":"\\pi(1) = 1","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"html":"π(1)=1\\pi(1) = 1π(1)=1","key":"wZ0zxvgmRL"},{"type":"text","value":". 
Suppose we optimize over the parameterized family ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"zTGeIi7Yt2"},{"type":"inlineMath","value":"\\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"html":"πθ(1)=exp(θ)1+exp(θ)\\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}πθ(1)=1+exp(θ)exp(θ)","key":"xWoeeG78XD"},{"type":"text","value":".\nThen our optimization algorithm should set ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"Z57vYbP5GY"},{"type":"text","value":"θ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"dKdh4gyc3N"},{"type":"text","value":" to be unboundedly large.\nThe “vanilla” gradient is","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"omRH1yM0pI"}],"key":"oSdEaO7ev9"},{"type":"math","value":"\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.","position":{"start":{"line":853,"column":1},"end":{"line":853,"column":1}},"html":"θJ(πθ)=99exp(θ)(1+exp(θ))2.\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.θJ(πθ)=(1+exp(θ))299exp(θ).","enumerator":"6.38","key":"cytaNCrAYT"},{"type":"paragraph","position":{"start":{"line":855,"column":1},"end":{"line":856,"column":1}},"children":[{"type":"text","value":"Note that as ","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"tRDlIVmjTU"},{"type":"inlineMath","value":"\\theta \\to \\infty","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"html":"θ\\theta \\to \\inftyθ","key":"cvl7kCYRRP"},{"type":"text","value":", the increments get closer and closer to ","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"Ji1bvEWRjW"},{"type":"text","value":"0","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"vIF6lgDXm8"},{"type":"text","value":";\nthe rate of increase becomes exponentially slow.","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"cstwyROAfC"}],"key":"L3IWJ9xCrZ"},{"type":"paragraph","position":{"start":{"line":859,"column":1},"end":{"line":859,"column":1}},"children":[{"type":"text","value":"However, if we compute the Fisher information “matrix” (which is just a scalar in this case), we can account for the geometry induced by the parameterization.","position":{"start":{"line":859,"column":1},"end":{"line":859,"column":1}},"key":"lk4ng1uE3r"}],"key":"YcTEjfLa0D"},{"type":"math","value":"\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}","position":{"start":{"line":861,"column":1},"end":{"line":866,"column":1}},"html":"Fθ=Exπθ[(θlogπθ(x))2]=exp(θ)(1+exp(θ))2.\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}Fθ=Exπθ[(θlogπθ(x))2]=(1+exp(θ))2exp(θ).","enumerator":"6.39","key":"oSKMS5pxwg"},{"type":"paragraph","position":{"start":{"line":868,"column":1},"end":{"line":868,"column":1}},"children":[{"type":"text","value":"This gives the natural gradient update","position":{"start":{"line":868,"column":1},"end":{"line":868,"column":1}},"key":"q7xSV7pCJO"}],"key":"ncORrf3Udr"},{"type":"math","value":"\\begin{aligned}\n \\theta^{k+1} & = 
\\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}","position":{"start":{"line":870,"column":1},"end":{"line":875,"column":1}},"html":"θk+1=θk+ηFθk1θJ(θk)=θk+99η\\begin{aligned}\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}θk+1=θk+ηFθk1θJ(θk)=θk+99η","enumerator":"6.40","key":"X83evQqfaC"},{"type":"paragraph","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"which increases at a constant rate, i.e. improves the objective more quickly than “vanilla” gradient ascent.","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"QzY4xPYA0a"}],"key":"PcKjipYkij"}],"enumerator":"6.1","html_id":"natural-simple","key":"bzp5T8v1JI"},{"type":"paragraph","position":{"start":{"line":880,"column":1},"end":{"line":884,"column":1}},"children":[{"type":"text","value":"Though the NPG now gives a closed-form optimization step,\nit requires computing the inverse Fisher information matrix,\nwhich typically scales as ","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"BP6Lxuok15"},{"type":"inlineMath","value":"O((\\dim \\Theta)^3)","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"html":"O((dimΘ)3)O((\\dim \\Theta)^3)O((dimΘ)3)","key":"wdRSlkOvVA"},{"type":"text","value":".\nThis can be expensive if the parameter space is large.\nCan we find an algorithm that works in ","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"zQqj7WlxmH"},{"type":"emphasis","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"children":[{"type":"text","value":"linear time","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"fZc4DW4D7u"}],"key":"kPV9naAKAH"},{"type":"text","value":" with respect to the dimension of the parameter space?","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"sJoUyuzGVm"}],"key":"vD8IfNRM18"}],"key":"MI4MhLDtmN"},{"type":"block","position":{"start":{"line":886,"column":1},"end":{"line":886,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":888,"column":1},"end":{"line":888,"column":1}},"children":[{"type":"text","value":"Proximal policy optimization","position":{"start":{"line":888,"column":1},"end":{"line":888,"column":1}},"key":"bTIsI9crvw"}],"identifier":"proximal-policy-optimization","label":"Proximal policy optimization","html_id":"proximal-policy-optimization","implicit":true,"enumerator":"6.9","key":"TtggvmfmOV"},{"type":"paragraph","position":{"start":{"line":890,"column":1},"end":{"line":892,"column":1}},"children":[{"type":"text","value":"We can relax the TRPO optimization problem in a different way:\nRather than imposing a hard constraint on the KL distance,\nwe can instead impose a ","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"puQVE1vktY"},{"type":"emphasis","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"children":[{"type":"text","value":"soft","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"YYJYvS1VO9"}],"key":"OLj64AEOlQ"},{"type":"text","value":" constraint by incorporating it into the objective and penalizing parameter values that drastically change the trajectory 
distribution.","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"Qh6YzIjlkz"}],"key":"MPqKgc6rVn"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}","position":{"start":{"line":894,"column":1},"end":{"line":898,"column":1}},"html":"θk+1argmaxθEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}θk+1argθmaxEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)","enumerator":"6.41","key":"s1Le17LQXu"},{"type":"paragraph","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"Here ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"p8Yh88Z7gB"},{"type":"text","value":"λ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"jPJ8Hxmc2a"},{"type":"text","value":" is a ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"LPcha4xPPf"},{"type":"strong","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"regularization hyperparameter","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"L3aCLepWXf"}],"key":"wxX6Do7T4v"},{"type":"text","value":" that controls the tradeoff between the two terms.","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"XFKiWIAxKu"}],"key":"qNogE1lroj"},{"type":"paragraph","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"Like the original TRPO algorithm ","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"T8SsZ9334S"},{"type":"crossReference","kind":"proof:definition","identifier":"trpo","label":"trpo","children":[{"type":"text","value":"Definition ","key":"wkXdJ24e0s"},{"type":"text","value":"6.4","key":"yf6oF7z24K"}],"template":"Definition %s","enumerator":"6.4","resolved":true,"html_id":"trpo","key":"ppfDUWeC9P"},{"type":"text","value":", PPO is not gradient-based; rather, at each step, we try to maximize local advantage relative to the current policy.","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"sqfJW4sCCp"}],"key":"yo9XhvJwXs"},{"type":"paragraph","position":{"start":{"line":904,"column":1},"end":{"line":905,"column":1}},"children":[{"type":"text","value":"How do we solve this optimization?\nLet us begin by simplifying the ","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"key":"GE6HYWmXEI"},{"type":"inlineMath","value":"\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"html":"KL(ρπkρπθ)\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}KL(ρπkρπθ)","key":"CT0zSXLNfU"},{"type":"text","value":" term. 
Expanding gives","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"key":"IyNgNpWHKf"}],"key":"MyChMqGBC1"},{"type":"math","value":"\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}","position":{"start":{"line":907,"column":1},"end":{"line":913,"column":1}},"html":"KL(ρπkρπθ)=Eτρπk[logρπk(τ)ρπθ(τ)]=Eτρπk[h=0H1logπk(ahsh)πθ(ahsh)]state transitions cancel=Eτρπk[h=0H1log1πθ(ahsh)]+c\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}KL(ρπkρπθ)=Eτρπk[logρπθ(τ)ρπk(τ)]=Eτρπk[h=0H1logπθ(ahsh)πk(ahsh)]=Eτρπk[h=0H1logπθ(ahsh)1]+cstate transitions cancel","enumerator":"6.42","key":"cmCNkpgY4o"},{"type":"paragraph","position":{"start":{"line":915,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"Y9XP4jxWcl"},{"type":"inlineMath","value":"c","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"html":"ccc","key":"hzTu3DEsNN"},{"type":"text","value":" is some constant with respect to ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"LghCpIOsvA"},{"type":"text","value":"θ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"yUwtKbofes"},{"type":"text","value":", and can be ignored.\nThis gives the objective","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"ynYjTktsav"}],"key":"EPcybrksxi"},{"type":"math","value":"\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]","position":{"start":{"line":918,"column":1},"end":{"line":922,"column":1}},"html":"k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1log1πθ(ahsh)]\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1logπθ(ahsh)1]","enumerator":"6.43","key":"RI3bW5YmRu"},{"type":"paragraph","position":{"start":{"line":924,"column":1},"end":{"line":928,"column":1}},"children":[{"type":"text","value":"Once again, this takes an expectation over trajectories.\nBut here we cannot directly sample 
trajectories from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"o6N1498ig2"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πk\\pi^kπk","key":"x26J3rDVku"},{"type":"text","value":",\nsince in the first term, the actions actually come from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"TPQL0ZBZNG"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"PuWGKX9bRT"},{"type":"text","value":".\nTo make this term line up with the other expectation,\nwe would need the actions to also come from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"NoWF4iijHx"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πk\\pi^kπk","key":"T7asks9Ti3"},{"type":"text","value":".","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"Y15QUUTspi"}],"key":"QYaZ9yDpbq"},{"type":"paragraph","position":{"start":{"line":930,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"This should sound familiar:\nwe want to estimate an expectation over one distribution by sampling from another.\nWe can once again use ","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"tla9CIbsWN"},{"type":"crossReference","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"children":[{"type":"text","value":"Section ","key":"HSeFyrAEqd"},{"type":"text","value":"6.3.3","key":"RYfzCBkIOW"}],"identifier":"importance_sampling","label":"importance_sampling","kind":"heading","template":"Section %s","enumerator":"6.3.3","resolved":true,"html_id":"importance-sampling","key":"UaYSG6dnyW"},{"type":"text","value":" to rewrite the inner expectation:","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"EbxIWECDfl"}],"key":"QbHlg4Co8o"},{"type":"math","value":"\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)","position":{"start":{"line":934,"column":1},"end":{"line":938,"column":1}},"html":"Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πθ(ahsh)πk(ahsh)Aπk(sh,ah)\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πk(ahsh)πθ(ahsh)Aπk(sh,ah)","enumerator":"6.44","key":"hztvuSXIWS"},{"type":"paragraph","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"Now we can combine the expectations together to get the objective","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"UyG6f2Ffpz"}],"key":"KcWF3rQuTE"},{"type":"math","value":"\\ell^k(\\theta) = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]","position":{"start":{"line":942,"column":1},"end":{"line":944,"column":1}},"html":"k(θ)=Eτρπk[h=0H1(πθ(ahsh)πk(ahsh)Aπk(sh,ah)λlog1πθ(ahsh))]\\ell^k(\\theta) = \\E_{\\tau \\sim 
\\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]k(θ)=Eτρπk[h=0H1(πk(ahsh)πθ(ahsh)Aπk(sh,ah)λlogπθ(ahsh)1)]","enumerator":"6.45","key":"VK16ZtDAUm"},{"type":"paragraph","position":{"start":{"line":946,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Now we can estimate this function by a sample average over trajectories from ","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"key":"G8WPCz3XMt"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"html":"πk\\pi^kπk","key":"EEw9pMpPn4"},{"type":"text","value":".\nRemember that to complete a single iteration of PPO,\nwe execute","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"key":"wX0GaiKj9i"}],"key":"frCwnXg5sv"},{"type":"math","value":"\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).","position":{"start":{"line":950,"column":1},"end":{"line":952,"column":1}},"html":"θk+1argmaxθk(θ).\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).θk+1argθmaxk(θ).","enumerator":"6.46","key":"WghV4qAlqa"},{"type":"paragraph","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"children":[{"type":"text","value":"If ","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"key":"thqo8HNpnM"},{"type":"inlineMath","value":"\\ell^k","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"html":"k\\ell^kk","key":"vwUyTCfk7X"},{"type":"text","value":" is differentiable, we can optimize it by gradient ascent, completing a single iteration of PPO.","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"key":"Qgsnrn48Zk"}],"key":"xGX67fEPBA"},{"type":"code","lang":"python","value":"def ppo_pseudocode(\n env,\n π: Callable[[Params], Callable[[State, Action], Float]],\n λ: float,\n θ_init: Params,\n n_iters: int,\n n_fit_trajectories: int,\n n_sample_trajectories: int,\n):\n θ = θ_init\n for k in range(n_iters):\n fit_trajectories = sample_trajectories(env, π(θ), n_fit_trajectories)\n A_hat = fit(fit_trajectories)\n\n sample_trajectories = sample_trajectories(env, π(θ), n_sample_trajectories)\n \n def objective(θ_opt):\n total_objective = 0\n for τ in sample_trajectories:\n for s, a, _r in τ:\n total_objective += π(θ_opt)(s, a) / π(θ)(s, a) * A_hat(s, a) + λ * jnp.log(π(θ_opt)(s, a))\n return total_objective / n_sample_trajectories\n \n θ = optimize(objective, θ)\n\n return θ","position":{"start":{"line":956,"column":1},"end":{"line":983,"column":1}},"key":"pQ9TQRhSPk"},{"type":"heading","depth":2,"position":{"start":{"line":985,"column":1},"end":{"line":985,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":985,"column":1},"end":{"line":985,"column":1}},"key":"luZdbsztwU"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"6.10","key":"FSaSkFxduA"},{"type":"paragraph","position":{"start":{"line":987,"column":1},"end":{"line":987,"column":1}},"children":[{"type":"text","value":"Policy gradient methods are a powerful family of algorithms that directly optimize the total reward by iteratively updating the policy 
parameters.","position":{"start":{"line":987,"column":1},"end":{"line":987,"column":1}},"key":"BDnyFYgUUs"}],"key":"Q0dTFf0aRN"},{"type":"paragraph","position":{"start":{"line":989,"column":1},"end":{"line":989,"column":1}},"children":[{"type":"text","value":"TODO","position":{"start":{"line":989,"column":1},"end":{"line":989,"column":1}},"key":"pmHoRTPz9q"}],"key":"IHMn6E6ncd"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":991,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":991,"column":1},"end":{"line":991,"column":1}},"children":[{"type":"text","value":"Vanilla policy gradient","position":{"start":{"line":991,"column":1},"end":{"line":991,"column":1}},"key":"TtvloYTvAC"}],"key":"e63zkkAp0Q"},{"type":"listItem","spread":true,"position":{"start":{"line":992,"column":1},"end":{"line":992,"column":1}},"children":[{"type":"text","value":"Baselines and advantages","position":{"start":{"line":992,"column":1},"end":{"line":992,"column":1}},"key":"aFdKJsphbr"}],"key":"xHdBVuf2AY"},{"type":"listItem","spread":true,"position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"children":[{"type":"text","value":"Trust region policy optimization","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"key":"iDSteUYls8"}],"key":"wrxm73Tgkp"},{"type":"listItem","spread":true,"position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"key":"ZQ7xWI3rcq"}],"key":"FyXVCHkPEh"},{"type":"listItem","spread":true,"position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"Proximal policy optimization","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"JVI2wBbhlm"}],"key":"rxVQ6gAd3C"}],"key":"YRcpAp939Z"}],"key":"OLuh8qiEzT"}],"key":"cRS9RaKcns"},"references":{"cite":{"order":["boyd_convex_2004"],"data":{"boyd_convex_2004":{"label":"boyd_convex_2004","enumerator":"1","html":"Boyd, S., & Vandenberghe, L. (2004). Convex Optimization. 
Cambridge University Press."}}}},"footer":{"navigation":{"prev":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"47497ac56bd39ac3a823e8bfd6c4097e933a72960f31d7f469a8610a4e9554df","slug":"pg","location":"/pg.md","dependencies":[],"frontmatter":{"title":"6 Policy Gradient Methods","numbering":{"all":{"enabled":true},"enumerator":{"template":"6.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.png","thumbnailOptimized":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.webp","exports":[{"format":"md","filename":"pg.md","url":"/build/pg-955e7c04f204da0cc1efa76c01287d9f.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"o3jGXzbTjd"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"6.1","key":"jEO54wAA4v"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"The core task of RL is finding the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"hjbDRPt8Yt"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"P4NuXXuuXP"}],"key":"jCrNSOi9ec"},{"type":"text","value":" in a given environment.\nThis is essentially an ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"sQ7jEbB2Uv"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"optimization problem:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"nQlkWzLpFq"}],"key":"JDDcSZn9iv"},{"type":"text","value":"\nout of some space of policies,\nwe want to find the one that achieves the maximum total reward (in expectation).","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ixxyceibhA"}],"key":"dwm2lSiv2V"},{"type":"paragraph","position":{"start":{"line":25,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"It’s typically intractable to compute the optimal policy exactly.\nInstead, 
","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"FnYizRGqBG"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"policy optimization algorithms","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"tImHVhaXEH"}],"key":"Oea3642WXn"},{"type":"text","value":" start from some randomly initialized policy,\nand then ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"uWaAWj1WQc"},{"type":"emphasis","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"improve","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"CM4YAZqzdZ"}],"key":"kzXLKiOsxJ"},{"type":"text","value":" it step by step.\nWe’ve already seen some examples of these,\nnamely ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"MPkmU0LcCr"},{"type":"crossReference","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"Section ","key":"kHtndc9A0J"},{"type":"text","value":"1.5.3.2","key":"lqg8Cgw9Vs"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"kJQUqN9zPA"},{"type":"text","value":" for finite MDPs and ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"qpUcovrM8b"},{"type":"crossReference","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"Section ","key":"wTqA7pTDrZ"},{"type":"text","value":"2.6.4","key":"DnXW3LCwjX"}],"identifier":"iterative_lqr","label":"iterative_lqr","kind":"heading","template":"Section %s","enumerator":"2.6.4","resolved":true,"html_id":"iterative-lqr","remote":true,"url":"/control","dataUrl":"/control.json","key":"ay4mDN3N1Z"},{"type":"text","value":" in continuous control.\nIn particular, we often use policies that can be described by some finite set of ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"cbpzei1fob"},{"type":"emphasis","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"parameters.","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"vdKfTibWlL"}],"key":"s0xG7dwLLA"},{"type":"text","value":"\nFor such parameterized policies,\nwe can approximate the ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"cvCiErkiFl"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"policy gradient:","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"BzJ26o4Crv"}],"key":"NrRKogPiY8"},{"type":"text","value":"\nthe gradient of the expected total reward with respect to the parameters.\nThis tells us the direction the parameters should be updated to achieve a higher total reward (in expectation).\nPolicy gradient methods are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models,\nmany of which use policies parameterized as deep neural 
networks.","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"FZmta5SYSI"}],"key":"faek6yzHQc"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":38,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":38,"column":1},"end":{"line":39,"column":1}},"children":[{"type":"text","value":"We begin the chapter with a short review of gradient ascent,\na general ","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"vTB51fkXne"},{"type":"strong","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"text","value":"optimization method.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"XovImgHmlW"}],"key":"ubHP5pphVK"}],"key":"pEiJeWbi84"},{"type":"listItem","spread":true,"position":{"start":{"line":40,"column":1},"end":{"line":41,"column":1}},"children":[{"type":"text","value":"We’ll then see how to estimate the ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"CongDz4lmV"},{"type":"strong","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"policy gradient,","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"C1o1o117EI"}],"key":"tyPSKAgIKx"},{"type":"text","value":"\nenabling us to apply (stochastic) gradient ascent in the RL setting.","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"wvcSyX4jVP"}],"key":"DPut3kB5s8"},{"type":"listItem","spread":true,"position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"Then we’ll explore some ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"WiyeerH2Nj"},{"type":"emphasis","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"proximal optimization","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"SAVrJbd2Gi"}],"key":"wJ90O1GlVC"},{"type":"text","value":" techniques that ensure the steps taken are “not too large”.\nThis is helpful to stabilize training and widely used in practice.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"I1Jk4ZGrpP"}],"key":"SpwaTU8Tmt"}],"key":"yAub2obGCJ"}],"key":"S43pYMYn8u"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from utils import plt, Array, Callable, jax, jnp","key":"Z5PGOOb94g"},{"type":"output","id":"K2WNM5MoSfz1enig_LM4U","data":[],"key":"V4Z6YpyFA6"}],"data":{},"key":"BOOANRySE5"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":49,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"Gradient Ascent","position":{"start":{"line":49,"column":1},"end":{"line":49,"column":1}},"key":"Qf0l1qHxVf"}],"identifier":"gradient-ascent","label":"Gradient Ascent","html_id":"gradient-ascent","implicit":true,"enumerator":"6.2","key":"AJVx8Q4uVr"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"strong","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"Gradient ascent","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"KYKAsQj4e8"}],"key":"KF8Y9Atr6p"},{"type":"text","value":" is a general optimization 
algorithm for any differentiable function.\nA suitable analogy for this algorithm is hiking up a mountain,\nwhere you keep taking steps in the steepest direction upwards.\nHere, your vertical position ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"WNDKZwQoyX"},{"type":"inlineMath","value":"y","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"html":"yyy","key":"rbHEL8gqZz"},{"type":"text","value":" is the function being optimized,\nand your horizontal position ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"bxseYRYRPG"},{"type":"inlineMath","value":"(x, z)","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"html":"(x,z)(x, z)(x,z)","key":"nr9OZm4McR"},{"type":"text","value":" is the input to the function.\nThe ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"ZUT8SzZY5T"},{"type":"emphasis","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"slope","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"uZfk0QXmKN"}],"key":"NGLx4Xljsg"},{"type":"text","value":" of the mountain at your current position is given by the ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"DLfcfDnzNZ"},{"type":"emphasis","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"gradient","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"c8v8NUICDO"}],"key":"XhEIU7ZxLX"},{"type":"text","value":",\nwritten ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"QEWZmYwZ8H"},{"type":"inlineMath","value":"\\nabla y(x, z) \\in \\mathbb{R}^2","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"html":"y(x,z)R2\\nabla y(x, z) \\in \\mathbb{R}^2y(x,z)R2","key":"aSqMTbdRGp"},{"type":"text","value":".","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"e5GbbcHi1c"}],"key":"icOz3eDTgF"}],"key":"OOoPqh4AtD"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def f(x, y):\n \"\"\"Himmelblau's function\"\"\"\n return (x**2 + y - 11)**2 + (x + y**2 - 7)**2\n\n# Create a grid of points\nx = jnp.linspace(-5, 5, 400)\ny = jnp.linspace(-5, 5, 400)\nX, Y = jnp.meshgrid(x, y)\nZ = f(X, Y)\n\n# Create the plot\nfig, ax = plt.subplots(figsize=(6, 6))\n\n# Plot the function using imshow\nimg = ax.imshow(Z, extent=[-5, 5, -5, 5], origin='lower')\n\n# Add color bar\nfig.colorbar(img, ax=ax)\n\n# Gradient computation using JAX\ntx, ty = 1.0, 1.0\ngx, gy = jax.grad(f, argnums=(0, 1))(tx, ty)\n\n# Scatter point\nax.scatter(tx, ty, color='red', s=100)\n\n# Add arrow representing the gradient\nax.arrow(tx, ty, gx * 0.01, gy * 0.01, head_width=0.3, head_length=0.3, fc='blue', ec='blue')\n\n# Add plot title\nax.set_title(\"Himmelblau's Function\")\n\nplt.show()","key":"W8aytrMqmS"},{"type":"output","id":"MBfeXe1zQiRxj0TY4xWjs","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"b8e65b5253271f49ddf227a711c3aa2c","path":"/build/b8e65b5253271f49ddf227a711c3aa2c.png"}}}],"key":"rq9Cduipxf"}],"data":{},"key":"EKeOxM21o8"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":95,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"text","value":"For differentiable functions, this can be thought of as the vector of partial derivatives,","position":{"start":{"line":95,"column":1},"end":{"line":95,"column":1}},"key":"jyegdFaGxX"}],"key":"EzrC6wdsG2"},{"type":"math","value":"\\nabla y(x, z) = \\begin{pmatrix}\n\\frac{\\partial y}{\\partial x} \\\\\n\\frac{\\partial y}{\\partial z}\n\\end{pmatrix}.","position":{"start":{"line":97,"column":1},"end":{"line":102,"column":1}},"html":"y(x,z)=(yxyz).\\nabla y(x, z) = \\begin{pmatrix}\n\\frac{\\partial y}{\\partial x} \\\\\n\\frac{\\partial y}{\\partial z}\n\\end{pmatrix}.y(x,z)=(xyzy).","enumerator":"6.1","key":"aUZ923VDL9"},{"type":"paragraph","position":{"start":{"line":104,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"To calculate the ","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"lLr7CtmEkk"},{"type":"emphasis","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"slope","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"gM5QZmGgR7"}],"key":"nRhOEqBdEF"},{"type":"text","value":" (aka “directional derivative”) of the mountain in a given direction ","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"OsKKbIeepa"},{"type":"inlineMath","value":"(\\Delta x, \\Delta z)","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"html":"(Δx,Δz)(\\Delta x, \\Delta z)(Δx,Δz)","key":"nwnlvlJWuv"},{"type":"text","value":",\nyou take the dot product of the difference vector with the gradient.\nThis means that the direction with the highest slope is exactly the gradient itself,\nso we can describe the gradient ascent algorithm as follows:","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"kt9x1cyoV2"}],"key":"F3fe2zjSxE"},{"type":"proof","kind":"definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Gradient ascent","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"nTznuZz6Ji"}],"key":"aJBRQAeCKB"},{"type":"math","value":"\\begin{pmatrix}\nx^{k+1} \\\\ z^{k+1}\n\\end{pmatrix}\n= \n\\begin{pmatrix}\nx^{k} \\\\ z^{k}\n\\end{pmatrix}\n+\n\\eta \\nabla y(x^{k}, z^{k})","position":{"start":{"line":110,"column":1},"end":{"line":120,"column":1}},"html":"(xk+1zk+1)=(xkzk)+ηy(xk,zk)\\begin{pmatrix}\nx^{k+1} \\\\ z^{k+1}\n\\end{pmatrix}\n= \n\\begin{pmatrix}\nx^{k} \\\\ z^{k}\n\\end{pmatrix}\n+\n\\eta \\nabla y(x^{k}, z^{k})(xk+1zk+1)=(xkzk)+ηy(xk,zk)","enumerator":"6.2","key":"jIPndLqxAu"}],"enumerator":"6.1","key":"wSjcpZgZov"},{"type":"paragraph","position":{"start":{"line":123,"column":1},"end":{"line":124,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"QoDcqR3FXO"},{"type":"inlineMath","value":"k","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"kkk","key":"czlDAzxGvh"},{"type":"text","value":" denotes the iteration of the algorithm and 
","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"y6rjUL0LgE"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"η>0\\eta > 0η>0","key":"rTg8mNkLqN"},{"type":"text","value":" is a “step size” hyperparameter that controls the size of the steps we take.\n(Note that we could also vary the step size across iterations, that is, ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"i8s1Bx0PzN"},{"type":"inlineMath","value":"\\eta^0, \\dots, \\eta^K","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"η0,,ηK\\eta^0, \\dots, \\eta^Kη0,,ηK","key":"HYOQTiIlNP"},{"type":"text","value":".)","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"KVRkrgkxIR"}],"key":"hzUTiuTjmW"},{"type":"paragraph","position":{"start":{"line":126,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"text","value":"The case of a two-dimensional input is easy to visualize.\nBut this idea can be straightforwardly extended to higher-dimensional inputs.","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"key":"oYVBJveW9L"}],"key":"BYcRNfRZap"},{"type":"paragraph","position":{"start":{"line":129,"column":1},"end":{"line":130,"column":1}},"children":[{"type":"text","value":"From now on, we’ll use ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"Q2ADU6u0JE"},{"type":"inlineMath","value":"J","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"html":"JJJ","key":"NvWfjMgpet"},{"type":"text","value":" to denote the function we’re trying to maximize,\nand ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"q2sEBpyeea"},{"type":"text","value":"θ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"nVbc9JXmwz"},{"type":"text","value":" to denote the parameters being optimized over. 
(In the above example, ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"DLWVKql14N"},{"type":"inlineMath","value":"\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\top","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"html":"θ=(xz)\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\topθ=(xz)","key":"nALBHRMvIZ"},{"type":"text","value":").","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"sbVo3mOpMU"}],"key":"ZhUMDHQXel"},{"type":"paragraph","position":{"start":{"line":132,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"text","value":"Notice that our parameters will stop changing once ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"sBsdMsNX2h"},{"type":"inlineMath","value":"\\nabla J(\\theta) = 0.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"J(θ)=0.\\nabla J(\\theta) = 0.J(θ)=0.","key":"WKdQs4a5lx"},{"type":"text","value":"\nOnce we reach this ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"z24ulSBCdz"},{"type":"strong","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"stationary point,","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"yo8CqfE0tJ"}],"key":"PwDoEPwEza"},{"type":"text","value":" our current parameters are ‘locally optimal’ in some sense;\nit’s impossible to increase the function by moving in any direction.\nIf ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"LpLoP3pg6H"},{"type":"inlineMath","value":"J","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"JJJ","key":"xWAleRXTws"},{"type":"text","value":" is ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"wB8rnvJz5s"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"convex","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"BLVfnzDLhh"}],"key":"biOMeo7YOt"},{"type":"text","value":", then the only point where this happens is at the ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"toETehZtve"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"global optimum.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"dhnAzg3beg"}],"key":"JFnfDdvHZt"},{"type":"text","value":"\nOtherwise, if ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"a69xNgX3bH"},{"type":"inlineMath","value":"J","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"JJJ","key":"NELKrLfsSG"},{"type":"text","value":" is nonconvex, the best we can hope for is a ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"IltAOB2pEv"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"local 
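To make the update rule concrete, here is a minimal sketch of gradient ascent in JAX. The toy objective `J`, the step size, and the iteration count are illustrative choices, not from the text; the objective is concave, so the iterates approach its unique maximum.

```python
import jax
import jax.numpy as jnp

def J(theta):
    """A concave toy objective whose unique maximum is at theta = (1, -2)."""
    return -jnp.sum((theta - jnp.array([1.0, -2.0])) ** 2)

def gradient_ascent(J, theta_init, eta=0.1, n_steps=100):
    """Iterate theta <- theta + eta * grad J(theta)."""
    grad_J = jax.grad(J)
    theta = theta_init
    for _ in range(n_steps):
        theta = theta + eta * grad_J(theta)
    return theta

theta = gradient_ascent(J, jnp.zeros(2))
print(theta)  # approaches [1, -2], where the gradient vanishes
```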
optimum.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"l8LsxyqWFE"}],"key":"e9a2KrbpMT"}],"key":"w1KMg0g0UC"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"WAQ5lVSx6K"}],"key":"wmgMLfGEIR"},{"type":"paragraph","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"children":[{"type":"text","value":"How does a computer compute the gradient of a function?","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"key":"sJAMUgPRTV"}],"key":"Jr9WtoFj2c"},{"type":"paragraph","position":{"start":{"line":141,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"One way is ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"vauNfvBuYV"},{"type":"emphasis","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"symbolic differentiation,","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"b1j619m1EW"}],"key":"a6DMIKs89c"},{"type":"text","value":"\nwhich is similar to the way you might compute it by hand:\nthe computer applies a list of rules to transform the ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"RxlsOO3qU1"},{"type":"emphasis","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"symbols","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"ntDIgS7g0Q"}],"key":"JaJ4kigtnI"},{"type":"text","value":" involved.\nPython’s ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"HDXlt3zO2o"},{"type":"inlineCode","value":"sympy","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"PSARloLQVh"},{"type":"text","value":" package supports symbolic differentiation.\nHowever, functions implemented in code may not always have a straightforward symbolic representation.","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"Dizf0pTYGH"}],"key":"eksExAXGHN"},{"type":"paragraph","position":{"start":{"line":147,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"Another way is ","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"bBU2emTmVC"},{"type":"emphasis","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"numerical differentiation,","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"mLix3YvwaI"}],"key":"woyEr7fX6b"},{"type":"text","value":"\nwhich is based on the limit definition of a (directional) derivative:","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"RhFLouQqhJ"}],"key":"Byj69QeMlM"},{"type":"math","value":"\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - J(\\boldsymbol{x})}{\\varepsilon}","position":{"start":{"line":150,"column":1},"end":{"line":153,"column":1}},"html":"uJ(x)=limε0J(x+εu)J(x)ε\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - 
J(\\boldsymbol{x})}{\\varepsilon}uJ(x)=ε0limεJ(x+εu)J(x)","enumerator":"6.3","key":"v7Y3DNbeOl"},{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":160,"column":1}},"children":[{"type":"text","value":"Then, we can substitute a small value of ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"XojZYYgZA6"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"ε\\varepsilonε","key":"LS3luvf6SB"},{"type":"text","value":" on the r.h.s. to approximate the directional derivative.\nHow small, though? If we need an accurate estimate,\nwe may need such a small value of ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"F05vJwOCHD"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"ε\\varepsilonε","key":"kHN4nK5sDh"},{"type":"text","value":" that typical computers will run into rounding errors.\nAlso, to compute the full gradient,\nwe would need to compute the r.h.s. once for each input dimension.\nThis is an issue if computing ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"rpvJ2TeNca"},{"type":"inlineMath","value":"J","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"JJJ","key":"H2dxuMRsDC"},{"type":"text","value":" is expensive.","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"GAXV2CRKt5"}],"key":"Nwc4buVDUd"},{"type":"paragraph","position":{"start":{"line":162,"column":1},"end":{"line":169,"column":1}},"children":[{"type":"strong","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"children":[{"type":"text","value":"Automatic differentiation","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"OkkgdhVffS"}],"key":"W9Em3SyPXX"},{"type":"text","value":" achieves the best of both worlds.\nLike symbolic differentiation,\nwe manually implement the derivative rules for a few basic operations.\nHowever, instead of executing these on the ","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"PE5xfOH3aZ"},{"type":"emphasis","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"children":[{"type":"text","value":"symbols","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"jWvmiUVKrH"}],"key":"NTY6qSvYKT"},{"type":"text","value":",\nwe execute them on the ","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"ytfxJE0z1o"},{"type":"emphasis","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"children":[{"type":"text","value":"values","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"RxN40tgCHf"}],"key":"bSjzLnrwV2"},{"type":"text","value":" when the function gets called,\nlike in numerical differentiation.\nThis allows us to differentiate through programming constructs such as branches or loops,\nand doesn’t involve any arbitrarily small values.","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"tNvG9TClbN"}],"key":"FeWXAgKdwa"}],"key":"c6auZoFMYA"}],"key":"ICh8m1PKbb"},{"type":"block","position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"Stochastic 
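As a quick illustration of the tradeoffs above, the sketch below compares a finite-difference estimate against automatic differentiation on the Himmelblau function from earlier; the value of $\varepsilon$ and the test point are arbitrary.

```python
import jax

def f(x, y):
    """Himmelblau's function, as defined above."""
    return (x**2 + y - 11)**2 + (x + y**2 - 7)**2

x0, y0, eps = 1.0, 1.0, 1e-4

# Numerical differentiation: one extra function evaluation per input dimension,
# and the result carries an O(eps) truncation error.
df_dx_numeric = (f(x0 + eps, y0) - f(x0, y0)) / eps
df_dy_numeric = (f(x0, y0 + eps) - f(x0, y0)) / eps

# Automatic differentiation: exact up to floating point, with no eps to tune.
df_dx_auto, df_dy_auto = jax.grad(f, argnums=(0, 1))(x0, y0)

print(df_dx_numeric, df_dx_auto)
print(df_dy_numeric, df_dy_auto)
```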
gradient ascent","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"C2DocgyfJr"}],"identifier":"stochastic-gradient-ascent","label":"Stochastic gradient ascent","html_id":"stochastic-gradient-ascent","implicit":true,"enumerator":"6.2.1","key":"H1ZQWTnogb"},{"type":"paragraph","position":{"start":{"line":176,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"text","value":"In real applications,\ncomputing the gradient of the target function is not so simple.\nAs an example from supervised learning, ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"QEKbGTZ90F"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"f59xmXWh0f"},{"type":"text","value":" might be the sum of squared prediction errors across an entire training dataset.\nHowever, if our dataset is very large, it might not fit into our computer’s memory!\nIn these cases, we often compute some ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"M8cgemzsXU"},{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"estimate","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"ZAesgBEMOs"}],"key":"LFjwVILabc"},{"type":"text","value":" of the gradient at each step, ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"hSIy5moO9P"},{"type":"inlineMath","value":"\\tilde \\nabla J(\\theta)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"html":"~J(θ)\\tilde \\nabla J(\\theta)~J(θ)","key":"FziwOaiUK6"},{"type":"text","value":", and walk in that direction instead.\nThis is called ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"HoOYe4QzqS"},{"type":"strong","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"DW9lCsGpQk"}],"key":"afKFeHNPSX"},{"type":"text","value":" gradient ascent.\nIn the SL example above, we might randomly choose a ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"elvv57vb1I"},{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"minibatch","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"N1soQqLhIi"}],"key":"b7WrtclcnA"},{"type":"text","value":" of samples and use them to estimate the true prediction error. 
(This approach is known as ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"PijtCLnskw"},{"type":"strong","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"minibatch","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"ROgJxktYFo"}],"key":"KyV0FzX2hi"},{"type":"text","value":" SGD","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"v2WtCOj6ey"}],"key":"vCgCCUeRb3"},{"type":"text","value":".)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"hGfS67AShk"}],"key":"NSawrUOCpx"}],"key":"ffWzfQiDg2"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def sgd(\n θ_init: Array,\n estimate_gradient: Callable[[Array], Array],\n η: float,\n n_steps: int,\n):\n \"\"\"Perform `n_steps` steps of SGD.\n\n `estimate_gradient` eats the current parameters and returns an estimate of the objective function's gradient at those parameters.\n \"\"\"\n θ = θ_init\n for step in range(n_steps):\n θ += η * estimate_gradient(θ)\n return θ","key":"iHZWvbD6uc"},{"type":"output","id":"QLP7QRmVGLJr60aFiETa4","data":[],"key":"Peq0hnExae"}],"data":{},"key":"xNK67nXSsi"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":201,"column":1},"end":{"line":202,"column":1}},"children":[{"type":"text","value":"What makes one gradient estimator better than another?\nIdeally, we want this estimator to be ","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"UJVvGHeT8D"},{"type":"strong","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"text","value":"unbiased;","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"QN6I1Ci3cK"}],"key":"te6F6GbhiX"},{"type":"text","value":" that is, on average, it matches a single true gradient step:","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"Dlakv5ZVUB"}],"key":"BWPC6Su8yJ"},{"type":"math","value":"\\E [\\tilde \\nabla J(\\theta)] = \\nabla J(\\theta).","position":{"start":{"line":204,"column":1},"end":{"line":206,"column":1}},"html":"E[~J(θ)]=J(θ).\\E [\\tilde \\nabla J(\\theta)] = \\nabla J(\\theta).E[~J(θ)]=J(θ).","enumerator":"6.4","key":"X1GxVjYEod"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"children":[{"type":"text","value":"We also want the ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"nXFXZ7FpZ5"},{"type":"emphasis","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"children":[{"type":"text","value":"variance","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"oYHApY27KT"}],"key":"ZJxDSRzYa2"},{"type":"text","value":" of the estimator to be low so that its performance doesn’t change drastically at each step.","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"a6bcHK1kXr"}],"key":"qsWZB81Rhg"},{"type":"paragraph","position":{"start":{"line":210,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"We can actually show that, for many “nice” functions, in a finite number of steps, SGD will find a 
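To make the `estimate_gradient` interface concrete, here is a hypothetical minibatch least-squares example that plugs into the `sgd` function above. Each call samples a fresh minibatch, so the returned gradient is an unbiased estimate of the full-data gradient; the dataset, batch size, and step size are made up for illustration.

```python
import jax
import jax.numpy as jnp

# Synthetic linear-regression data: y = X @ w_true exactly.
key_x, key_w = jax.random.split(jax.random.PRNGKey(0))
X = jax.random.normal(key_x, (1000, 5))
w_true = jax.random.normal(key_w, (5,))
y = X @ w_true

def J(θ, X_batch, y_batch):
    """Objective to *maximize*: the negated mean squared error on a batch."""
    return -jnp.mean((X_batch @ θ - y_batch) ** 2)

def make_estimate_gradient(batch_size=32):
    keys = iter(jax.random.split(jax.random.PRNGKey(1), 10_000))
    def estimate_gradient(θ):
        # A uniformly random minibatch yields an unbiased estimate of the full-data gradient.
        idx = jax.random.choice(next(keys), X.shape[0], shape=(batch_size,))
        return jax.grad(J)(θ, X[idx], y[idx])
    return estimate_gradient

θ_hat = sgd(jnp.zeros(5), make_estimate_gradient(), η=0.1, n_steps=500)
print(jnp.linalg.norm(θ_hat - w_true))  # should be small
```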
","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"uDk9va3iRK"},{"type":"text","value":"θ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"tiTXkAwJhd"},{"type":"text","value":" that is “close” to a stationary point.\nIn another perspective, for such functions, the local “landscape” of ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"qwIX5pYaMZ"},{"type":"inlineMath","value":"J","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"html":"JJJ","key":"cR6UlVYDRB"},{"type":"text","value":" around ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"gvLptkIpwd"},{"type":"text","value":"θ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"gKYWVP0FGp"},{"type":"text","value":" becomes flatter and flatter the longer we run SGD.","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"xfRi0mm6J8"}],"key":"PCLThRrNYN"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"SGD convergence","position":{"start":{"line":213,"column":1},"end":{"line":213,"column":1}},"key":"ThJzawORqE"}],"key":"S9igq42j13"},{"type":"paragraph","position":{"start":{"line":214,"column":1},"end":{"line":217,"column":1}},"children":[{"type":"text","value":"More formally, suppose we run SGD for ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"U2OYwS2R7O"},{"type":"inlineMath","value":"K","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"KKK","key":"nyP42lCSfq"},{"type":"text","value":" steps, using an unbiased gradient estimator.\nLet the step size ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"hFsEI0zBhy"},{"type":"inlineMath","value":"\\eta^k","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"ηk\\eta^kηk","key":"XM2nNCGs2T"},{"type":"text","value":" scale as ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"BZbb7i1ilp"},{"type":"inlineMath","value":"O(1/\\sqrt{k}).","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"O(1/k).O(1/\\sqrt{k}).O(1/k).","key":"EgSgBwvnRg"},{"type":"text","value":"\nThen if ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"VUIvrFT27r"},{"type":"inlineMath","value":"J","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"JJJ","key":"VJo89leF7Z"},{"type":"text","value":" is bounded and ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"bfQYkpIPRY"},{"type":"text","value":"β","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"jHLke2SGjw"},{"type":"text","value":"-smooth (see below),\nand the ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"bE81MyliTl"},{"type":"emphasis","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"text","value":"norm","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"WkRnaNuBHr"}],"key":"ZeHH6ttHzW"},{"type":"text","value":" of the gradient estimator has a bounded second moment 
","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"m8Saxklx2y"},{"type":"inlineMath","value":"\\sigma^2,","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"σ2,\\sigma^2,σ2,","key":"tKCQrmZh5p"}],"key":"l3FDPIt2yT"},{"type":"math","value":"\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"html":"J(θK)2O(Mβσ2/K).\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).∥∇J(θK)2O(σ2/K).","enumerator":"6.5","key":"lWsCbWfT5e"},{"type":"paragraph","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"children":[{"type":"text","value":"We call a function ","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"Nl8ZDLPrDL"},{"type":"text","value":"β","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"taW4b93zgg"},{"type":"text","value":"-smooth if its gradient is Lipschitz continuous with constant ","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"vxon3K8TX8"},{"type":"text","value":"β","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"ykDP4BBxxQ"},{"type":"text","value":":","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"DEQjZIiZu0"}],"key":"rbhVHntpty"},{"type":"math","value":"\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"J(θ)J(θ)βθθ.\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.∥∇J(θ)J(θ)βθθ∥.","enumerator":"6.6","key":"ul7NK6Xvv6"}],"key":"U4Ofu2oZhz"},{"type":"paragraph","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"children":[{"type":"text","value":"We’ll now see a concrete application of gradient ascent in the context of policy optimization.","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"key":"YbswJ169EP"}],"key":"ujyVqDSTNN"}],"key":"bk9aAxId2Q"},{"type":"block","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"Policy (stochastic) gradient ascent","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"KO7cnqyAsE"}],"identifier":"policy-stochastic-gradient-ascent","label":"Policy (stochastic) gradient ascent","html_id":"policy-stochastic-gradient-ascent","implicit":true,"enumerator":"6.3","key":"Px4e1Ateps"},{"type":"paragraph","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"Remember that in RL, the primary goal is to find the ","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"jF5dtyghve"},{"type":"emphasis","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"gSSI0YGBi5"}],"key":"jJ0QphFhRT"},{"type":"text","value":" that achieves the maximimum total reward, which we can express using the value function we defined in 
","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"Uvz8RyXBbt"},{"type":"crossReference","kind":"proof:definition","identifier":"value","label":"value","children":[{"type":"text","value":"Definition ","key":"wU7JUmGgSH"},{"type":"text","value":"1.6","key":"NjfXtJJDRQ"}],"template":"Definition %s","enumerator":"1.6","resolved":true,"html_id":"value","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"tZnwRHEo25"},{"type":"text","value":":","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"WI6SvY0pGq"}],"key":"bAwXeCJKNq"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E \\sum_{\\hi=0}^{\\hor-1} r_\\hi \\\\\n \\text{where} \\quad & s_0 \\sim \\mu_0 \\\\\n & s_{t+1} \\sim P(s_\\hi, a_\\hi), \\\\\n & a_\\hi = \\pi(s_\\hi) \\\\\n & r_\\hi = r(s_\\hi, a_\\hi).\n\\end{aligned}","label":"objective_fn","identifier":"objective_fn","html":"J(π):=Es0μ0Vπ(s0)=Eh=0H1rhwheres0μ0st+1P(sh,ah),ah=π(sh)rh=r(sh,ah).\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E \\sum_{\\hi=0}^{\\hor-1} r_\\hi \\\\\n \\text{where} \\quad & s_0 \\sim \\mu_0 \\\\\n & s_{t+1} \\sim P(s_\\hi, a_\\hi), \\\\\n & a_\\hi = \\pi(s_\\hi) \\\\\n & r_\\hi = r(s_\\hi, a_\\hi).\n\\end{aligned}J(π):=Es0μ0Vπ(s0)=whereEh=0H1rhs0μ0st+1P(sh,ah),ah=π(sh)rh=r(sh,ah).","enumerator":"6.7","html_id":"objective-fn","key":"lX401GWI87"},{"type":"paragraph","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"(Note that we’ll continue to work in the ","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"JrwlKKO35x"},{"type":"emphasis","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"undiscounted, finite-horizon case.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"F4em7d1bbA"}],"key":"WgbUWVdJzD"},{"type":"text","value":" Analogous results hold for the ","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"v4o5GNzo8l"},{"type":"emphasis","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"discounted, infinite-horizon case.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"Wm8QuBnf3b"}],"key":"aIDun9vKBf"},{"type":"text","value":")","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"eDf3ZpOVDX"}],"key":"eJgbJpxtmQ"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":251,"column":1}},"children":[{"type":"text","value":"As shown by the notation, this is exactly the function ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"cLfR3xouZx"},{"type":"inlineMath","value":"J","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"JJJ","key":"vUwn2bFfOR"},{"type":"text","value":" that we want to maximize using gradient ascent.\nWhat does ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"wWYV8cVpyd"},{"type":"text","value":"θ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"vWtIcy61US"},{"type":"text","value":" correspond to, though?\nIn general, 
","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"VCeYEMk4jQ"},{"type":"text","value":"π","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"BT6i4uMHCJ"},{"type":"text","value":" is a function, and optimizing over the space of arbitrary input-output mappings would be intractable.\nInstead, we need to describe ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"kkZtUqPAwB"},{"type":"text","value":"π","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"uo989In5Ny"},{"type":"text","value":" in terms of some finite set of ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"shdUHvO6Z7"},{"type":"emphasis","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"parameters","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"kxFeRgET1y"}],"key":"Hs0Yu1F3gF"},{"type":"text","value":" ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"nL3dIEbqOm"},{"type":"text","value":"θ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"ZQ8OVFJqaZ"},{"type":"text","value":".","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"SGB064jxhY"}],"key":"f7pqNoon5K"}],"key":"mFdHie90xa"},{"type":"block","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Example policy parameterizations","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"WYXmimdfi1"}],"label":"parameterizations","identifier":"parameterizations","html_id":"parameterizations","enumerator":"6.3.1","key":"OXyteNSZ1X"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"What are some ways we could parameterize our policy?","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"pLEhc398VS"}],"key":"W2R5HrgTFA"}],"key":"k4J4blViwc"},{"type":"block","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"heading","depth":4,"position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"children":[{"type":"text","value":"Tabular representation","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"EzbE382tsJ"}],"identifier":"tabular-representation","label":"Tabular representation","html_id":"tabular-representation","implicit":true,"enumerator":"6.3.1.1","key":"el3C6N3QA0"},{"type":"paragraph","position":{"start":{"line":264,"column":1},"end":{"line":267,"column":1}},"children":[{"type":"text","value":"If both the state and action spaces are finite, perhaps we could simply learn a preference value ","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"DXrVY0EbUU"},{"type":"inlineMath","value":"\\theta_{s,a}","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"html":"θs,a\\theta_{s,a}θs,a","key":"dulnHG3JQg"},{"type":"text","value":" for each state-action pair.\nThen to turn this into a valid distribution, we perform a 
","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"hPlRPNbp0L"},{"type":"strong","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"children":[{"type":"text","value":"softmax","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"lOxFCn1qtt"}],"key":"WS0tSsb2Gx"},{"type":"text","value":" operation:\nwe exponentiate each of them,\nand then normalize to form a valid distribution:","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"eacoA79BSg"}],"key":"myA9l8icgs"},{"type":"math","value":"\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"html":"πθsoftmax(as)=exp(θs,a)s,aexp(θs,a).\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.πθsoftmax(as)=s,aexp(θs,a)exp(θs,a).","enumerator":"6.8","key":"zDBUCQvVrp"},{"type":"paragraph","position":{"start":{"line":271,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"However, this doesn’t make use of any structure in the states or actions,\nso while this is flexible, it is also prone to overfitting.","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"iHZND4z6E6"}],"key":"E760LE2ESB"},{"type":"heading","depth":4,"position":{"start":{"line":274,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Linear in features","position":{"start":{"line":274,"column":1},"end":{"line":274,"column":1}},"key":"Sv8JtjqzjY"}],"identifier":"linear-in-features","label":"Linear in features","html_id":"linear-in-features","implicit":true,"enumerator":"6.3.1.2","key":"wz5TjksTwn"},{"type":"paragraph","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"Another approach is to map each state-action pair into some ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"CMndVQ7Gx9"},{"type":"strong","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"feature space","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"pwmwzaVXML"}],"key":"hmUB8I6n0T"},{"type":"text","value":" ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"VJpwhtCHns"},{"type":"inlineMath","value":"\\phi(s, a) \\in \\mathbb{R}^p","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"ϕ(s,a)Rp\\phi(s, a) \\in \\mathbb{R}^pϕ(s,a)Rp","key":"rtG51oe5YM"},{"type":"text","value":". 
#### Linear in features

Another approach is to map each state-action pair into some **feature space** $\phi(s, a) \in \mathbb{R}^p$. Then, to map a feature vector to a probability, we take a linear combination of the features and take a softmax:

$$
\pi^\text{linear in features}_{\theta}(a|s) = \frac{\exp(\theta^\top \phi(s, a))}{\sum_{a'} \exp(\theta^\top \phi(s, a'))}.
$$

Another interpretation is that $\theta$ represents the feature vector of the "desired" state-action pair, as state-action pairs whose features align closely with $\theta$ are given higher probability.

The score function for this parameterization is also quite elegant:

$$
\begin{aligned}
    \nabla \log \pi_\theta(a|s) &= \nabla \left( \theta^\top \phi(s, a) - \log \left( \sum_{a'} \exp(\theta^\top \phi(s, a')) \right) \right) \\
    &= \phi(s, a) - \E_{a' \sim \pi_\theta(s)} \phi(s, a')
\end{aligned}
$$

Plugging this into our policy gradient expression, we get

$$
\begin{aligned}
    \nabla J(\theta) & = \E_{\tau \sim \rho_\theta} \left[
        \sum_{h=0}^{H-1} \nabla \log \pi_\theta(a_h | s_h) A_h^{\pi_\theta}(s_h, a_h)
    \right] \\
    & = \E_{\tau \sim \rho_\theta} \left[
        \sum_{h=0}^{H-1} \left( \phi(s_h, a_h) - \E_{a' \sim \pi(s_h)} \phi(s_h, a') \right) A_h^{\pi_\theta}(s_h, a_h)
    \right] \\
    & = \E_{\tau \sim \rho_\theta} \left[ \sum_{h=0}^{H-1} \phi(s_h, a_h) A_h^{\pi_\theta} (s_h, a_h) \right]
\end{aligned}
$$

Why can we drop the $\E \phi(s_h, a')$ term? By linearity of expectation, consider the dropped term at a single timestep: $\E_{\tau \sim \rho_\theta} \left[ \left( \E_{a' \sim \pi(s_h)} \phi(s_h, a') \right) A_h^{\pi_\theta}(s_h, a_h) \right]$. By Adam's Law, we can wrap the advantage term in a conditional expectation on the state $s_h$. Then we already know that $\E_{a \sim \pi(s)} A_h^{\pi}(s, a) = 0$, and so this entire term vanishes.
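To see the score-function identity concretely, the sketch below checks $\nabla \log \pi_\theta(a|s) = \phi(s, a) - \E_{a' \sim \pi_\theta(s)} \phi(s, a')$ numerically for a random feature matrix; the dimensions and features are made up for illustration.

```python
import jax
import jax.numpy as jnp
from jax.scipy.special import logsumexp

p, n_actions = 5, 4
key_phi, key_theta = jax.random.split(jax.random.PRNGKey(0))
phi_s = jax.random.normal(key_phi, (n_actions, p))  # rows are phi(s, a') for each action a'
theta = jax.random.normal(key_theta, (p,))

def log_pi(theta, a):
    logits = phi_s @ theta  # theta^T phi(s, a') for every action a'
    return logits[a] - logsumexp(logits)

a = 2
score_autodiff = jax.grad(log_pi)(theta, a)

# Closed form: phi(s, a) minus the policy-weighted average feature vector.
probs = jax.nn.softmax(phi_s @ theta)
score_closed_form = phi_s[a] - probs @ phi_s

print(jnp.allclose(score_autodiff, score_closed_form, atol=1e-5))  # True
```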
policies","html_id":"neural-policies","implicit":true,"enumerator":"6.3.1.3","key":"TKchd0qQ25"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"More generally, we could map states and actions to unnormalized scores via some parameterized function ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"S5yC7Cpgf7"},{"type":"inlineMath","value":"f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"fθ:S×AR,f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},fθ:S×AR,","key":"opxyKoXCvH"},{"type":"text","value":" such as a neural network, and choose actions according to a softmax: ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"x8z7DHtHf9"}],"key":"gyFuvtJy0S"},{"type":"math","value":"\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"tight":"before","html":"πθgeneral(as)=exp(fθ(s,a))aexp(fθ(s,a)).\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.πθgeneral(as)=aexp(fθ(s,a))exp(fθ(s,a)).","enumerator":"6.12","key":"iPceJOQAdR"},{"type":"paragraph","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"children":[{"type":"text","value":"The score can then be written as ","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"key":"Ti1IaaQzty"}],"key":"SuRHqzGuYK"},{"type":"math","value":"\\nabla \\log \\pi_\\theta(a|s) = \\nabla f_\\theta(s, a) - \\E_{a \\sim \\pi_\\theta(s)} \\nabla f_\\theta (s, a')","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"tight":"before","html":"logπθ(as)=fθ(s,a)Eaπθ(s)fθ(s,a)\\nabla \\log \\pi_\\theta(a|s) = \\nabla f_\\theta(s, a) - \\E_{a \\sim \\pi_\\theta(s)} \\nabla f_\\theta (s, a')logπθ(as)=fθ(s,a)Eaπθ(s)fθ(s,a)","enumerator":"6.13","key":"wvAV2wgBhd"}],"key":"Y7UFsj8IPw"},{"type":"block","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"children":[{"type":"text","value":"Continuous action spaces","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"tIm9p2EuFA"}],"identifier":"continuous-action-spaces","label":"Continuous action spaces","html_id":"continuous-action-spaces","implicit":true,"enumerator":"6.3.2","key":"jjKNdcnrwT"},{"type":"paragraph","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Consider a continuous ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"sjg0nIZswR"},{"type":"inlineMath","value":"n","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"nnn","key":"kQO183Mpdr"},{"type":"text","value":"-dimensional action space ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"dfXJpAzAR4"},{"type":"inlineMath","value":"\\mathcal{A} = \\mathbb{R}^n","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"A=Rn\\mathcal{A} = \\mathbb{R}^nA=Rn","key":"GXnnsThyr1"},{"type":"text","value":". 
Then for a stochastic policy, we could use a function to predict the *mean* action and then add some random noise about it. For example, we could use a neural network to predict the mean action $\mu_\theta(s)$ and then add some noise $\epsilon \sim \mathcal{N}(0, \sigma^2 I)$ to it:

$$
\pi_\theta(a|s) = \mathcal{N}(\mu_\theta(s), \sigma^2 I).
$$

% **Exercise:** Can you extend the "linear in features" policy to continuous action spaces in a similar way?
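Here is a minimal sketch of such a Gaussian policy; a random linear map stands in for the hypothetical mean network $\mu_\theta$, and $\sigma$ is held fixed rather than learned.

```python
import jax
import jax.numpy as jnp

state_dim, action_dim, sigma = 3, 2, 0.5
key_W, key_eps = jax.random.split(jax.random.PRNGKey(0))

# A random linear map standing in for the mean network mu_theta(s).
W = jax.random.normal(key_W, (action_dim, state_dim))

def mu_theta(s):
    return W @ s

def sample_action(s, key):
    """Sample a ~ N(mu_theta(s), sigma^2 I) via reparameterization: a = mu + sigma * eps."""
    eps = jax.random.normal(key, (action_dim,))
    return mu_theta(s) + sigma * eps

def log_pi(s, a):
    """Log-density of the isotropic Gaussian policy at action a."""
    return (-0.5 * jnp.sum((a - mu_theta(s)) ** 2) / sigma**2
            - 0.5 * action_dim * jnp.log(2 * jnp.pi * sigma**2))

s = jnp.ones(state_dim)
a = sample_action(s, key_eps)
print(a, log_pi(s, a))
```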
","key":"NdNMvQ0MKa"}],"key":"bysKcVCze7"},{"type":"block","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"children":[{"type":"text","value":"Now that we have seen parameterized policies, we can now write the total reward in terms of the parameters:","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"NaowCUzr0N"}],"key":"es7DyC88Y6"},{"type":"math","value":"J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau).","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"html":"J(θ)=EτρθR(τ).J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau).J(θ)=EτρθR(τ).","enumerator":"6.15","key":"tMO7yoHMCH"},{"type":"paragraph","position":{"start":{"line":328,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"text","value":"Now how do we maximize this function (the expected total reward) over the parameters?\nOne simple idea would be to directly apply gradient ascent:","position":{"start":{"line":328,"column":1},"end":{"line":328,"column":1}},"key":"eSMj32b2fA"}],"key":"facxYr11oM"},{"type":"math","value":"\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).","position":{"start":{"line":331,"column":1},"end":{"line":333,"column":1}},"html":"θk+1=θk+ηJ(θk).\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).θk+1=θk+ηJ(θk).","enumerator":"6.16","key":"qnScrzHvzt"},{"type":"paragraph","position":{"start":{"line":335,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"In order to apply this technique, we need to be able to evaluate the gradient ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"SbgOhqSIwh"},{"type":"inlineMath","value":"\\nabla J(\\theta).","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"J(θ).\\nabla J(\\theta).J(θ).","key":"AAIfJDVxSo"},{"type":"text","value":"\nBut ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"IDrtgUUnZ5"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"QlwirdSvuQ"},{"type":"text","value":" is very difficult, or even intractable, to compute exactly, since it involves taking an expectation over all possible trajectories ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"pMxkSqUJ6R"},{"type":"inlineMath","value":"\\tau.","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"τ.\\tau.τ.","key":"smB9n8Xj6z"},{"type":"text","value":"\nCan we rewrite it in a form that’s more convenient to implement?","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"LRxPUgIjry"}],"key":"kWB8qcPzBO"}],"key":"o6mg3cCtRL"},{"type":"block","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":342,"column":1},"end":{"line":342,"column":1}},"children":[{"type":"text","value":"Importance Sampling","position":{"start":{"line":342,"column":1},"end":{"line":342,"column":1}},"key":"vFUGNsMpNY"}],"label":"importance_sampling","identifier":"importance_sampling","html_id":"importance-sampling","enumerator":"6.3.3","key":"DPS8BjFgeK"},{"type":"paragraph","position":{"start":{"line":344,"column":1},"end":{"line":352,"column":1}},"children":[{"type":"text","value":"There is a general trick called 
","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"LYFdxPuFDq"},{"type":"strong","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"importance sampling","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"Lx2zerr4tp"}],"key":"TBbyEiILMW"},{"type":"text","value":" for evaluating such expectations.\nSuppose we want to estimate ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"zV5i0qBMR2"},{"type":"inlineMath","value":"\\E_{x \\sim p}[f(x)]","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"Exp[f(x)]\\E_{x \\sim p}[f(x)]Exp[f(x)]","key":"hiiAScFSXO"},{"type":"text","value":" where ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"NcFaF9hHCE"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"ZWQdo8lRvr"},{"type":"text","value":" is hard or expensive to sample from. We can, however, evaluate the likelihood ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"HoPtb0C7d1"},{"type":"inlineMath","value":"p(x)","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"p(x)p(x)p(x)","key":"Z8UxnMdP9w"},{"type":"text","value":".\nSuppose that we ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"nHGaYn8POs"},{"type":"emphasis","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"can","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"yXeuyZOaus"}],"key":"Fkqfp3LUKc"},{"type":"text","value":" sample from a different distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"TWqmGWm4rp"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"cNtK3obJjG"},{"type":"text","value":".\nSince an expectation is just a weighted average, we can sample ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"ONehSW3Fpf"},{"type":"inlineMath","value":"x","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"xxx","key":"xX3xYlE8R0"},{"type":"text","value":" from ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"S3Idr9O04q"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"UJgSjdSG7o"},{"type":"text","value":", compute ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"XlpsPSibQt"},{"type":"inlineMath","value":"f(x)","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"f(x)f(x)f(x)","key":"ilWyHuiGIL"},{"type":"text","value":", and then reweight the results:\nif ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"Bi3mdJQ9t4"},{"type":"inlineMath","value":"x","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"xxx","key":"qaWjU9yIAw"},{"type":"text","value":" is very likely under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"UM2SY5t85o"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"HWi456KQ3U"},{"type":"text","value":" but 
unlikely under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"mxQF9ebdj6"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"IVNcgr1Sdr"},{"type":"text","value":",\nwe should boost its weighting,\nand if it is common under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"Q95qXrd0qe"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"JgfQVBCa3C"},{"type":"text","value":" but uncommon under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"UqzGPApyV4"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"n43A772Pfx"},{"type":"text","value":",\nwe should lower its weighting.\nThe reweighting factor is exactly the ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"DodmfQgD5u"},{"type":"strong","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"likelihood ratio","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"NnntDA1tXN"}],"key":"aagyGj4e5b"},{"type":"text","value":" between the target distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"KkTjuk5TBV"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"HMOsH8rWLi"},{"type":"text","value":" and the sampling distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"FhAHydqzhd"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"mnoWLa0VWa"},{"type":"text","value":":","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"LIlqTzDdIC"}],"key":"xlgsrNIDh7"},{"type":"math","value":"\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].","position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"html":"Exp[f(x)]=xXf(x)p(x)=xXf(x)p(x)q(x)q(x)=Exq[p(x)q(x)f(x)].\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].Exp[f(x)]=xXf(x)p(x)=xXf(x)q(x)p(x)q(x)=Exq[q(x)p(x)f(x)].","enumerator":"6.17","key":"JLkS2ONsNl"},{"type":"paragraph","position":{"start":{"line":358,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Doesn’t this seem too good to be true? If there were no drawbacks, we could use this to estimate ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"eZv4JSLKCF"},{"type":"emphasis","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"zcC9TORNWt"}],"key":"U1TTu6z1Hk"},{"type":"text","value":" expectation of any function on any arbitrary distribution! 
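As a numerical sanity check, the following sketch estimates an expectation under a target density p using only samples from a proposal q, reweighting by the likelihood ratio; the particular densities and test function are assumptions chosen for illustration, not taken from the text:","key":"isLeadTxt"}],"key":"isLeadPar"},{"type":"code","lang":"python","value":"import jax\nimport jax.numpy as jnp\n\n# Hypothetical example: estimate E_{x ~ p}[f(x)] using samples from q only.\ndef normal_pdf(x, mean, std):\n    return jnp.exp(-0.5 * ((x - mean) / std) ** 2) / (std * jnp.sqrt(2 * jnp.pi))\n\nf = lambda x: x ** 2                   # function whose expectation we want\np = lambda x: normal_pdf(x, 1.0, 1.0)  # target distribution N(1, 1)\nq = lambda x: normal_pdf(x, 0.0, 2.0)  # sampling distribution N(0, 4)\n\nkey = jax.random.PRNGKey(0)\nx = 2.0 * jax.random.normal(key, (100_000,))  # samples drawn from q\nestimate = jnp.mean(p(x) / q(x) * f(x))       # reweight by the likelihood ratio\n# E_{x ~ N(1, 1)}[x^2] = 1^2 + 1 = 2, so the estimate should be close to 2.","key":"isDemoCode"},{"type":"paragraph","children":[{"type":"text","value":"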
The drawback is that the variance may be very large due to the likelihood ratio term.\nIf there are values of ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"EXTQydXzXq"},{"type":"inlineMath","value":"x","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"xxx","key":"YoprSMTEx3"},{"type":"text","value":" that are very rare in the sampling distribution ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"cAQInUqxB7"},{"type":"inlineMath","value":"q","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"qqq","key":"M31wj3rXWE"},{"type":"text","value":",\nbut common under ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"DPEGV8rGEX"},{"type":"inlineMath","value":"p","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"ppp","key":"IAYRUH1B79"},{"type":"text","value":",\nthen the likelihood ratio ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"XyuSBaEXzw"},{"type":"inlineMath","value":"p(x)/q(x)","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"p(x)/q(x)p(x)/q(x)p(x)/q(x)","key":"x4SkGyeS8G"},{"type":"text","value":" will cause the variance to blow up.","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"vC8rlmvtCn"}],"key":"USIyV4XnGc"},{"type":"heading","depth":2,"position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"children":[{"type":"text","value":"The REINFORCE policy gradient","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"ZAFKIywZkV"}],"identifier":"the-reinforce-policy-gradient","label":"The REINFORCE policy gradient","html_id":"the-reinforce-policy-gradient","implicit":true,"enumerator":"6.4","key":"krzXMX61LJ"},{"type":"paragraph","position":{"start":{"line":365,"column":1},"end":{"line":367,"column":1}},"children":[{"type":"text","value":"Returning to RL, suppose there is some trajectory distribution ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"sqQQgsvGEf"},{"type":"inlineMath","value":"\\rho(\\tau)","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"html":"ρ(τ)\\rho(\\tau)ρ(τ)","key":"aA2c9yq5mv"},{"type":"text","value":" that is ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"DqgCEB2zQL"},{"type":"strong","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"easy to sample from,","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"oMAXFtFKxQ"}],"key":"jGIKkLLbV1"},{"type":"text","value":" such as a database of existing trajectories.\nWe can then rewrite ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"fEnbAL69yZ"},{"type":"inlineMath","value":"\\nabla J(\\theta)","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"html":"J(θ)\\nabla J(\\theta)J(θ)","key":"fUb0KS4uFU"},{"type":"text","value":", a.k.a. 
the ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"de3chFFjRf"},{"type":"emphasis","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"policy gradient","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"UHG7CGdkoK"}],"key":"LV2Ozt6RxQ"},{"type":"text","value":", as follows.\nAll gradients are being taken with respect to ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"RGxPisZqWi"},{"type":"text","value":"θ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"gkreXkZxar"},{"type":"text","value":".","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"ZOFQYiuhuK"}],"key":"asSenUe8TF"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}","position":{"start":{"line":369,"column":1},"end":{"line":375,"column":1}},"html":"J(θ)=Eτρθ[R(τ)]=Eτρ[ρθ(τ)ρ(τ)R(τ)]likelihood ratio trick=Eτρ[ρθ(τ)ρ(τ)R(τ)]switching gradient and expectation\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}J(θ)=Eτρθ[R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]likelihood ratio trickswitching gradient and expectation","enumerator":"6.18","key":"wYW6pspzBt"},{"type":"paragraph","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"children":[{"type":"text","value":"Note that for ","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"zJMPEgHjeA"},{"type":"inlineMath","value":"\\rho = \\rho_\\theta","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"html":"ρ=ρθ\\rho = \\rho_\\thetaρ=ρθ","key":"UhKCuI0EDM"},{"type":"text","value":", the inside term becomes","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"Ng8YYvmqBg"}],"key":"Y2Y1y7czls"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].","position":{"start":{"line":379,"column":1},"end":{"line":381,"column":1}},"html":"J(θ)=Eτρθ[logρθ(τ)R(τ)].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].J(θ)=Eτρθ[logρθ(τ)R(τ)].","enumerator":"6.19","key":"tYmSZKEUfy"},{"type":"paragraph","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"children":[{"type":"text","value":"(The order of operations is ","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"koAZbbaKNT"},{"type":"inlineMath","value":"\\nabla (\\log \\rho_\\theta)(\\tau)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"html":"(logρθ)(τ)\\nabla (\\log 
\\rho_\\theta)(\\tau)(logρθ)(τ)","key":"wv5oaSTE1h"},{"type":"text","value":".)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"lN5XBzBw0p"}],"key":"x1SrkTvZxI"},{"type":"paragraph","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"Note that when the state transitions are Markov (i.e. ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"oKC6xaJEgR"},{"type":"inlineMath","value":"s_{t}","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"sts_{t}st","key":"cxrhy1Nh4j"},{"type":"text","value":" only depends on ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"yJZqdWdl5R"},{"type":"inlineMath","value":"s_{t-1}, a_{t-1}","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"st1,at1s_{t-1}, a_{t-1}st1,at1","key":"AB6hzheZd5"},{"type":"text","value":") and the policy is time-homogeneous (i.e. ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"zrxOtvYvO6"},{"type":"inlineMath","value":"a_\\hi \\sim \\pi_\\theta (s_\\hi)","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"ahπθ(sh)a_\\hi \\sim \\pi_\\theta (s_\\hi)ahπθ(sh)","key":"k170kNB2qd"},{"type":"text","value":"), we can write out the ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"TfGrj3hGc1"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"likelihood of a trajectory","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"gFAyENtoww"}],"key":"n9sQCfYXwh"},{"type":"text","value":" under the policy ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"LPflwrneqi"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"TNoV1Mb0dx"},{"type":"text","value":":","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"muIOm97zga"}],"key":"SPRp2JSDu5"},{"type":"math","value":"\\begin{aligned}\n \\rho_\\theta(\\tau) &= \\mu(s_0) \\pi_\\theta(a_0 | s_0) \\\\\n &\\qquad \\times P(s_1 | s_0, a_0) \\pi_\\theta(a_1 | s_1) \\\\\n &\\qquad \\times \\cdots \\\\\n &\\qquad \\times P(s_{H-1} | s_{H-2}, a_{H-2}) \\pi_\\theta(a_{H-1} | s_{H-1}).\n\\end{aligned}","label":"trajectory_likelihood","identifier":"trajectory_likelihood","html":"ρθ(τ)=μ(s0)πθ(a0s0)×P(s1s0,a0)πθ(a1s1)××P(sH1sH2,aH2)πθ(aH1sH1).\\begin{aligned}\n \\rho_\\theta(\\tau) &= \\mu(s_0) \\pi_\\theta(a_0 | s_0) \\\\\n &\\qquad \\times P(s_1 | s_0, a_0) \\pi_\\theta(a_1 | s_1) \\\\\n &\\qquad \\times \\cdots \\\\\n &\\qquad \\times P(s_{H-1} | s_{H-2}, a_{H-2}) \\pi_\\theta(a_{H-1} | s_{H-1}).\n\\end{aligned}ρθ(τ)=μ(s0)πθ(a0s0)×P(s1s0,a0)πθ(a1s1)××P(sH1sH2,aH2)πθ(aH1sH1).","enumerator":"6.20","html_id":"trajectory-likelihood","key":"CRJXuqTkNM"},{"type":"paragraph","position":{"start":{"line":398,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"Note that the log-trajectory-likelihood turns into a sum of terms,\nof which only the ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"rIaT4NMKqw"},{"type":"inlineMath","value":"\\pi_\\theta(a_\\hi | s_\\hi)","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"πθ(ahsh)\\pi_\\theta(a_\\hi | 
s_\\hi)πθ(ah∣sh)","key":"lAJ4L9kkpa"},{"type":"text","value":" terms depend on ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"lVuW1wCPpe"},{"type":"inlineMath","value":"\\theta,","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"θ,\\theta,θ,","key":"mj7rbY9I4C"},{"type":"text","value":"\nso we can simplify even further to obtain the following expression for the policy gradient, known as the “REINFORCE” policy gradient:","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"ROrk5iOhgg"}],"key":"huLmhMCRxT"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}","label":"reinforce_pg","identifier":"reinforce_pg","html":"J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)R(τ)]\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)R(τ)]","enumerator":"6.21","html_id":"reinforce-pg","key":"hdME1CdlzB"},{"type":"paragraph","position":{"start":{"line":410,"column":1},"end":{"line":413,"column":1}},"children":[{"type":"text","value":"This expression allows us to estimate the gradient by sampling a few trajectories from ","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"Kj3PHp2nDA"},{"type":"inlineMath","value":"\\pi_\\theta,","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"html":"πθ,\\pi_\\theta,πθ,","key":"ApFrZw5AZd"},{"type":"text","value":"\ncalculating the likelihoods of the chosen actions,\nand substituting these into the expression above.\nWe can then use this gradient estimate to apply stochastic gradient ascent.","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"q7yXhdCY0e"}],"key":"oTaeyuXuqb"},{"type":"code","lang":"python","value":"def estimate_gradient_reinforce_pseudocode(env, π, θ):\n    τ = sample_trajectory(env, π(θ))\n    gradient_hat = jnp.zeros_like(θ)\n    for s, a, r in τ:\n        # log-likelihood of the action chosen at this step under the policy\n        def policy_log_likelihood(θ_):\n            return jnp.log(π(θ_)(s, a))\n        # accumulate ∇ log π_θ(a | s) · R(τ)\n        gradient_hat += jax.grad(policy_log_likelihood)(θ) * τ.total_reward\n    return gradient_hat","position":{"start":{"line":415,"column":1},"end":{"line":424,"column":1}},"key":"cBhIWJ8gRb"},{"type":"paragraph","position":{"start":{"line":426,"column":1},"end":{"line":429,"column":1}},"children":[{"type":"text","value":"In fact, we can perform one more simplification.\nIntuitively, the action taken at step ","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"key":"sZHGosFMRf"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"html":"h\\hih","key":"wbUPy338b1"},{"type":"text","value":" does not affect the reward from previous timesteps, since they’re already in the past!\nYou can also show rigorously that this is the case,\nand that we only need to consider the present and future rewards to calculate the policy gradient:","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"key":"ymk5iStj34"}],"key":"Gpd3i2XfOB"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{\\hi' = \\hi}^{\\hor-1} r(s_{\\hi'}, a_{\\hi'}) \\right] \\\\\n &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} 
\\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_\\hi, a_\\hi) \\right]\n\\end{aligned}","label":"pg_with_q","identifier":"pg_with_q","html":"J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)h=hH1r(sh,ah)]=Eτρθ[h=0H1θlogπθ(ahsh)Qπθ(sh,ah)]\\begin{aligned}\n \\nabla J(\\theta) &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{\\hi' = \\hi}^{\\hor-1} r(s_{\\hi'}, a_{\\hi'}) \\right] \\\\\n &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_\\hi, a_\\hi) \\right]\n\\end{aligned}J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)h=hH1r(sh,ah)]=Eτρθ[h=0H1θlogπθ(ahsh)Qπθ(sh,ah)]","enumerator":"6.22","html_id":"pg-with-q","key":"aIKCIUDBs8"},{"type":"paragraph","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"strong","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"VAz6rXGQ2j"}],"key":"MCwIleDDt6"},{"type":"text","value":" Prove that this is equivalent to the previous definitions. What modification to the expression must be made for the discounted, infinite-horizon setting?","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"cjX8CoHaaM"}],"key":"It7XoRKaeW"},{"type":"paragraph","position":{"start":{"line":442,"column":1},"end":{"line":442,"column":1}},"children":[{"type":"text","value":"For some intuition into how this method works, recall that we update our parameters according to","position":{"start":{"line":442,"column":1},"end":{"line":442,"column":1}},"key":"L6gJJAARwy"}],"key":"VD2inwlkQo"},{"type":"math","value":"\\begin{aligned}\n \\theta^{k+1} &= \\theta^k + \\eta \\nabla J(\\theta^k) \\\\\n &= \\theta^k + \\eta \\E_{\\tau \\sim \\rho_{\\theta^k}} [\\nabla \\log \\rho_{\\theta^k}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}","position":{"start":{"line":444,"column":1},"end":{"line":449,"column":1}},"html":"θk+1=θk+ηJ(θk)=θk+ηEτρθk[logρθk(τ)R(τ)].\\begin{aligned}\n \\theta^{k+1} &= \\theta^k + \\eta \\nabla J(\\theta^k) \\\\\n &= \\theta^k + \\eta \\E_{\\tau \\sim \\rho_{\\theta^k}} [\\nabla \\log \\rho_{\\theta^k}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}θk+1=θk+ηJ(θk)=θk+ηEτρθk[logρθk(τ)R(τ)].","enumerator":"6.23","key":"AVM9kBkqFd"},{"type":"paragraph","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"children":[{"type":"text","value":"Consider the “good” trajectories where ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"StXm1ffwZ1"},{"type":"inlineMath","value":"R(\\tau)","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"R(τ)R(\\tau)R(τ)","key":"UTUMnk0dzt"},{"type":"text","value":" is large. Then ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"WcOXQlRr1e"},{"type":"text","value":"θ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"KJAsJ3sCYV"},{"type":"text","value":" gets updated so that these trajectories become more likely. 
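Concretely, plugging the estimator above into gradient ascent gives a training loop like the following sketch; the learning rate η and iteration count K are assumed hyperparameters, not values from the text:","key":"pgLoopLeadTxt"}],"key":"pgLoopLeadPar"},{"type":"code","lang":"python","value":"def reinforce_pseudocode(env, π, θ_init, η, K):\n    θ = θ_init\n    for k in range(K):\n        # estimate ∇J(θ) from freshly sampled trajectories, then step uphill\n        θ = θ + η * estimate_gradient_reinforce_pseudocode(env, π, θ)\n    return θ","key":"pgLoopCode"},{"type":"paragraph","children":[{"type":"text","value":"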
To see why, recall that ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"tN75h2khyF"},{"type":"inlineMath","value":"\\rho_{\\theta}(\\tau)","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"ρθ(τ)\\rho_{\\theta}(\\tau)ρθ(τ)","key":"jktsduoKyV"},{"type":"text","value":" is the likelihood of the trajectory ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"xNKDcuTMQ6"},{"type":"text","value":"τ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"hpbuK7OVQA"},{"type":"text","value":" under the policy ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"eiMbp7WanI"},{"type":"inlineMath","value":"\\pi_\\theta,","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"πθ,\\pi_\\theta,πθ,","key":"ZxG18dZKOS"},{"type":"text","value":" so evaluating the gradient points in the direction that makes ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"xOnzEoNxKt"},{"type":"text","value":"τ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"yBJIxCk90V"},{"type":"text","value":" more likely.","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"louLWmLins"}],"key":"rLS2Cs6rcJ"}],"key":"Qz1BEIADT8"},{"type":"block","position":{"start":{"line":453,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"children":[{"type":"text","value":"Baselines and advantages","position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"key":"lRZAPNPflZ"}],"identifier":"baselines-and-advantages","label":"Baselines and advantages","html_id":"baselines-and-advantages","implicit":true,"enumerator":"6.5","key":"H7l0UzsPM4"},{"type":"paragraph","position":{"start":{"line":457,"column":1},"end":{"line":460,"column":1}},"children":[{"type":"text","value":"A central idea from supervised learning is the ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"GXcBr2dsXA"},{"type":"strong","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"bias-variance decomposition","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"kkMul6s1xq"}],"key":"qDZf2vqdnk"},{"type":"text","value":",\nwhich shows that the mean squared error of an estimator is the sum of its squared bias and its variance.\nThe REINFORCE gradient estimator ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"hlrn9JnZhG"},{"type":"crossReference","kind":"equation","identifier":"reinforce_pg","label":"reinforce_pg","children":[{"type":"text","value":"(","key":"zTuTcZPECg"},{"type":"text","value":"6.21","key":"iYUbxrKeVR"},{"type":"text","value":")","key":"B1PFbSPAGe"}],"template":"(%s)","enumerator":"6.21","resolved":true,"html_id":"reinforce-pg","key":"v5LQn95HFL"},{"type":"text","value":" is already ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"mC8MmNaU70"},{"type":"emphasis","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"unbiased,","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"FvtCdExLHD"}],"key":"MkvkzlhZrY"},{"type":"text","value":" meaning that its expectation over trajectories is the true policy 
gradient.\nCan we find ways to reduce its ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"pTTp3QsOsN"},{"type":"emphasis","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"variance","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"B5bC29hNwK"}],"key":"h4mejISy2z"},{"type":"text","value":" as well?","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"nTUQ091QGN"}],"key":"hgGNiLsXHq"},{"type":"paragraph","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"children":[{"type":"text","value":"One common way is to subtract a ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"LFdTjiSD1k"},{"type":"strong","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"children":[{"type":"text","value":"baseline function","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"Ly7DeO9w1S"}],"key":"kdlhUt3rDZ"},{"type":"text","value":" ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"RhAnPQvvi2"},{"type":"inlineMath","value":"b_\\hi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"html":"bh:SRb_\\hi : \\mathcal{S} \\to \\mathbb{R}bh:SR","key":"eCqIP9EC13"},{"type":"text","value":" at each timestep ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"GpZTruEGTM"},{"type":"inlineMath","value":"\\hi.","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"html":"h.\\hi.h.","key":"DeRceaoCRB"},{"type":"text","value":" This modifies the policy gradient as follows:","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"oA3Vb3LAIt"}],"key":"I30QvaByK8"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{H-1} \\nabla \\log \\pi_\\theta (a_\\hi | s_\\hi) \\left(\n \\left(\n \\sum_{\\hi' = \\hi}^{H-1} r_{\\hi'}\n \\right)\n - b_\\hi(s_\\hi)\n \\right)\n \\right].","position":{"start":{"line":464,"column":1},"end":{"line":474,"column":1}},"identifier":"eq:pg_baseline","label":"eq:pg_baseline","html_id":"eq-pg-baseline","html":"J(θ)=Eτρθ[h=0H1logπθ(ahsh)((h=hH1rh)bh(sh))].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{H-1} \\nabla \\log \\pi_\\theta (a_\\hi | s_\\hi) \\left(\n \\left(\n \\sum_{\\hi' = \\hi}^{H-1} r_{\\hi'}\n \\right)\n - b_\\hi(s_\\hi)\n \\right)\n \\right].J(θ)=Eτρθ[h=0H1logπθ(ahsh)((h=hH1rh)bh(sh))].","enumerator":"6.24","key":"ATiS2yBQ2E"},{"type":"paragraph","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"children":[{"type":"text","value":"For example, we might want ","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"key":"BfJ7Z2C9Sl"},{"type":"inlineMath","value":"b_\\hi","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"html":"bhb_\\hibh","key":"HRbbMMWWfp"},{"type":"text","value":" to estimate the average reward-to-go at a given timestep:","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"key":"Dsnsqq7RdG"}],"key":"vPfeE5kle0"},{"type":"math","value":"b_\\hi^\\theta = \\E_{\\tau \\sim \\rho_\\theta} R_\\hi(\\tau).","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"html":"bhθ=EτρθRh(τ).b_\\hi^\\theta = \\E_{\\tau \\sim \\rho_\\theta} 
R_\\hi(\\tau).bhθ=EτρθRh(τ).","enumerator":"6.25","key":"dInjfwxlMl"},{"type":"paragraph","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"children":[{"type":"text","value":"This way, the random variable ","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"fOTb1IVYBY"},{"type":"inlineMath","value":"R_\\hi(\\tau) - b_\\hi^\\theta","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"html":"Rh(τ)bhθR_\\hi(\\tau) - b_\\hi^\\thetaRh(τ)bhθ","key":"qQ2IxnugM2"},{"type":"text","value":" is centered around zero, making certain algorithms more stable.","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"HUe0wT1HtG"}],"key":"eyrx4bxRjV"},{"type":"paragraph","position":{"start":{"line":482,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"As a better baseline, we could instead choose the ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"JnnC7cgZG1"},{"type":"emphasis","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"text","value":"value function.","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"p1KBm2dbrt"}],"key":"pVIJ9wSSxH"},{"type":"text","value":"\nNote that the random variable ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"ZMUUi6JlCG"},{"type":"inlineMath","value":"Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s),","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"html":"Qhπ(s,a)Vhπ(s),Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s),Qhπ(s,a)Vhπ(s),","key":"bDLquodMam"},{"type":"text","value":"\nwhere the randomness is taken over the actions, is also centered around zero.\n(Recall ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"pWzl9vlj9U"},{"type":"inlineMath","value":"V^\\pi_\\hi(s) = \\E_{a \\sim \\pi} Q^\\pi_\\hi(s, a).","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"html":"Vhπ(s)=EaπQhπ(s,a).V^\\pi_\\hi(s) = \\E_{a \\sim \\pi} Q^\\pi_\\hi(s, a).Vhπ(s)=EaπQhπ(s,a).","key":"huuW0Hj1fO"},{"type":"text","value":")\nIn fact, this quantity has a particular name: the ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"ELhalEmgIH"},{"type":"strong","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"text","value":"advantage function.","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"r80KKRWUi0"}],"key":"XQNzlghPLX"},{"type":"text","value":"\nThis measures how much better this action does than the average for that policy.\n(Note that for an optimal policy ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"hox6NHwUYG"},{"type":"inlineMath","value":"\\pi^\\star,","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"html":"π,\\pi^\\star,π,","key":"yt6XZ4n0T2"},{"type":"text","value":" the advantage of a given state-action pair is always zero or negative.)","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"MSor9qXHTl"}],"key":"J0zjJfN3We"},{"type":"paragraph","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"children":[{"type":"text","value":"We can now express the policy gradient as follows. 
Note that the advantage function effectively replaces the ","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"key":"IYS4dNlniS"},{"type":"inlineMath","value":"Q","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"html":"QQQ","key":"tUxBog5fmj"},{"type":"text","value":"-function from ","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"key":"geRtbo8Ebf"},{"type":"crossReference","kind":"equation","identifier":"pg_with_q","label":"pg_with_q","children":[{"type":"text","value":"(","key":"MQrMCrqZtH"},{"type":"text","value":"6.22","key":"TuvMAq1DtZ"},{"type":"text","value":")","key":"IsbmIQQTVy"}],"template":"(%s)","enumerator":"6.22","resolved":true,"html_id":"pg-with-q","key":"AR7SuHe1zO"},{"type":"text","value":":","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"key":"ZD2yuXB2Wy"}],"key":"cXno6otOBq"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{\\hor-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A^{\\pi_\\theta}_\\hi (s_\\hi, a_\\hi)\n\\right].","label":"pg_advantage","identifier":"pg_advantage","html":"J(θ)=Eτρθ[h=0H1logπθ(ahsh)Ahπθ(sh,ah)].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{\\hor-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A^{\\pi_\\theta}_\\hi (s_\\hi, a_\\hi)\n\\right].J(θ)=Eτρθ[h=0H1logπθ(ahsh)Ahπθ(sh,ah)].","enumerator":"6.26","html_id":"pg-advantage","key":"TirDKOSV3m"},{"type":"paragraph","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"Note that to avoid correlations between the gradient estimator and the value estimator (i.e. baseline), we must estimate them with independently sampled trajectories:","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"vuqvtO3ZNb"}],"key":"V7xnxHwobE"},{"type":"comment","value":" TODO could use more explanation _why_ we want to avoid correlations ","key":"pNe4PKuHZl"},{"type":"proof","kind":"definition","label":"pg_baseline","identifier":"pg_baseline","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy gradient with a learned baseline","position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"key":"Su37YvvHwC"}],"key":"NlcjAczGnB"},{"type":"code","lang":"python","value":"def return_to_go(τ, h):\n    # sum of rewards from step h to the end of the trajectory\n    return sum(r for (s, a, r) in τ[h:])\n\ndef pg_with_learned_baseline_pseudocode(env, π, η, θ_init, K, N):\n    θ = θ_init\n    for k in range(K):\n        trajectories = sample_trajectories(env, π(θ), N)\n        V_hat = fit(trajectories)  # estimates the value function of π(θ)\n        τ = sample_trajectories(env, π(θ), 1)[0]  # a fresh, independent trajectory\n        g = jnp.zeros_like(θ)  # gradient estimator\n\n        for h, (s, a, r) in enumerate(τ):\n            def log_likelihood(θ_):\n                return jnp.log(π(θ_)(s, a))\n            g = g + jax.grad(log_likelihood)(θ) * (return_to_go(τ, h) - V_hat(s))\n\n        θ = θ + η * g\n    return θ","position":{"start":{"line":507,"column":1},"end":{"line":523,"column":1}},"key":"nbXtenzRrL"},{"type":"paragraph","position":{"start":{"line":525,"column":1},"end":{"line":526,"column":1}},"children":[{"type":"text","value":"Note that you could also generalize this by allowing the learning rate ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"vdiqfDo6di"},{"type":"text","value":"η","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"iCpGd54pFZ"},{"type":"text","value":" to vary across steps,\nor take multiple trajectories 
","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"Hc3DfqxuV0"},{"type":"text","value":"τ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"cmhZBecm2B"},{"type":"text","value":" and compute the sample average of the gradient estimates.","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"g9ZPwmk0Sd"}],"key":"Vhb4mRmjgp"},{"type":"paragraph","position":{"start":{"line":528,"column":1},"end":{"line":529,"column":1}},"children":[{"type":"text","value":"The baseline estimation step ","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"eRhwxsMPLB"},{"type":"inlineCode","value":"fit","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"VQrxALIrMo"},{"type":"text","value":" can be done using any appropriate supervised learning algorithm.\nNote that the gradient estimator will be unbiased regardless of the baseline.","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"a9SKtnZiZ1"}],"key":"ijNNEY5G2o"}],"enumerator":"6.2","html_id":"pg-baseline","key":"tQPIVc9SLK"}],"key":"ImwNCOXuuP"},{"type":"block","position":{"start":{"line":532,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"Comparing policy gradient algorithms to policy iteration","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"oc7iKUsmXM"}],"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","label":"Comparing policy gradient algorithms to policy iteration","html_id":"comparing-policy-gradient-algorithms-to-policy-iteration","implicit":true,"enumerator":"6.6","key":"RldhViIA32"},{"type":"comment","value":" TODO maybe restructure this part ","key":"xgx4yZRTS2"},{"type":"paragraph","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"What advantages does the policy gradient algorithm have over ","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"DrW7zwNghh"},{"type":"crossReference","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Section ","key":"KwEhtgLson"},{"type":"text","value":"1.5.3.2","key":"DN66xf9Imh"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"VtdoGvulSz"},{"type":"text","value":"?","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"D7EVRvZGKc"}],"key":"MYnD8DNYjL"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy iteration recap","position":{"start":{"line":540,"column":1},"end":{"line":540,"column":1}},"key":"Uokp1uSCrZ"}],"key":"qMfY1MAnwK"},{"type":"paragraph","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"children":[{"type":"text","value":"Recall that policy iteration is an algorithm for MDPs with unknown state transitions where we alternate between these two 
steps:","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"Bx5iKtWkxF"}],"key":"oxgGyQqdoH"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":543,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"Estimating the ","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"FKBBao3scK"},{"type":"inlineMath","value":"Q","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"html":"QQQ","key":"YXt4sdRXRL"},{"type":"text","value":"-function (or advantage function) of the current policy;","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"PxfRBjY1ec"}],"key":"f82Aptel1H"},{"type":"listItem","spread":true,"position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"text","value":"Updating the policy to be greedy w.r.t. this approximate ","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"Y3yyIx8muP"},{"type":"inlineMath","value":"Q","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"html":"QQQ","key":"KQSONc69RT"},{"type":"text","value":"-function (or advantage function).","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"Ci8WA6YdiS"}],"key":"ApRMKRwDHP"}],"key":"RzuxUo1Wpq"}],"key":"QFAPR5tXs9"},{"type":"paragraph","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"To analyze the difference between them, we’ll make use of the ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"W6S7n8rCQs"},{"type":"strong","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"performance difference lemma","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"pYHSHM4Ae6"}],"key":"eiNL4hu8cc"},{"type":"text","value":", which provides an expression for comparing the difference between two value functions.","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"hpiTmGJTCC"}],"key":"uN4cOfUF5l"},{"type":"proof","kind":"theorem","label":"pdl","identifier":"pdl","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance difference lemma","position":{"start":{"line":549,"column":1},"end":{"line":549,"column":1}},"key":"zzGiHyHcBl"}],"key":"uQAnw3tv3Q"},{"type":"paragraph","position":{"start":{"line":552,"column":1},"end":{"line":555,"column":1}},"children":[{"type":"text","value":"Suppose Alice is playing a game (an MDP).\nBob is spectating, and can evaluate how good an action is compared to his own strategy.\n(That is, Bob can compute his ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"ol7ueldzvN"},{"type":"emphasis","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"children":[{"type":"text","value":"advantage function","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"B9yZFZ44xb"}],"key":"vAjbWOPtk1"},{"type":"text","value":" ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"HyIwpEVV7N"},{"type":"inlineMath","value":"A_\\hi^{\\text{Bob}}(s_\\hi, 
a_\\hi)","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"html":"AhBob(sh,ah)A_\\hi^{\\text{Bob}}(s_\\hi, a_\\hi)AhBob(sh,ah)","key":"WNxihSOPp1"},{"type":"text","value":").\nThe performance difference lemma says that Bob can now calculate exactly how much better or worse he is than Alice as follows:","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"jz1PcMdYGS"}],"key":"eMCYhxPYz7"},{"type":"math","value":"V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]","label":"pdl_eq","identifier":"pdl_eq","html":"V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]","enumerator":"6.27","html_id":"pdl-eq","key":"ZHH61ugDeX"},{"type":"paragraph","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"uAAQ5qgGXU"},{"type":"inlineMath","value":"\\rho_{\\text{Alice}, s}","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"html":"ρAlice,s\\rho_{\\text{Alice}, s}ρAlice,s","key":"heYmK9TRvG"},{"type":"text","value":" denotes the distribution over trajectories starting in state ","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"xPwzpBf1Ww"},{"type":"inlineMath","value":"s","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"html":"sss","key":"isFmIZSFAK"},{"type":"text","value":" when Alice is playing.","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"RCrv0altLe"}],"key":"l8LbP6PfBP"},{"type":"paragraph","position":{"start":{"line":564,"column":1},"end":{"line":566,"column":1}},"children":[{"type":"text","value":"To see why, consider just a single step ","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"key":"KYAdNz7jBZ"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"html":"h\\hih","key":"UKJ8rCDlQl"},{"type":"text","value":" of the trajectory.\nAt this step we compute how much better actions from Bob are than the actions from Alice, on average.\nBut this is exactly the average Bob-advantage across actions from Alice, as described in the PDL!","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"key":"xy8JZyuVSM"}],"key":"eYypnRDNgG"},{"type":"paragraph","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"children":[{"type":"text","value":"Formally, this corresponds to a nice telescoping simplification when we expand out the definition of the advantage function. 
Note that","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"key":"ErsdfAQC46"}],"key":"amzfhmD5UG"},{"type":"math","value":"\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}","position":{"start":{"line":570,"column":1},"end":{"line":575,"column":1}},"html":"Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)","enumerator":"6.28","key":"jcYk8byagl"},{"type":"paragraph","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"children":[{"type":"text","value":"so expanding out the r.h.s. expression of ","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"key":"cyhLuvrUpw"},{"type":"crossReference","kind":"equation","identifier":"pdl_eq","label":"pdl_eq","children":[{"type":"text","value":"(","key":"hHZsWYD8vN"},{"type":"text","value":"6.27","key":"jitlvLSzeS"},{"type":"text","value":")","key":"ebJsiDu2TA"}],"template":"(%s)","enumerator":"6.27","resolved":true,"html_id":"pdl-eq","key":"lelGTRB2Zh"},{"type":"text","value":" and grouping terms together gives","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"key":"FSeqgBEYuN"}],"key":"OMtDiyAuKE"},{"type":"math","value":"\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}","position":{"start":{"line":579,"column":1},"end":{"line":584,"column":1}},"html":"EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)","enumerator":"6.29","key":"qWoWvnH3vF"},{"type":"paragraph","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"as desired. 
(Note that the “inner” expectation from expanding the advantage function has the same distribution as the outer one, so omitting it here is valid.)","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"key":"U7PlbJgWID"}],"key":"bR81iWo9L4"}],"enumerator":"6.1","html_id":"pdl","key":"RzCpZLp1ZN"},{"type":"paragraph","position":{"start":{"line":589,"column":1},"end":{"line":594,"column":1}},"children":[{"type":"text","value":"The PDL gives insight into why fitted approaches such as PI don’t work as well in the “full” RL setting.\nTo see why, let’s consider a single iteration of policy iteration, where policy ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"uc1ee3DdgR"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"dJJOB9gzwG"},{"type":"text","value":" gets updated to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"EQtjutZjKV"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"E1N30YpBk4"},{"type":"text","value":". We’ll assume these policies are deterministic.\nSuppose the new policy ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"C6lk9B0srg"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"l8ogIsf2aG"},{"type":"text","value":" chooses some action with a negative advantage with respect to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"pg31rc1F1R"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Unr5BrwyAJ"},{"type":"text","value":".\nThat is, when acting according to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"jPqKObSNHE"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"uMZjBadMzH"},{"type":"text","value":", taking the action from ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"eTepuPFhMT"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"lLwuLewkl0"},{"type":"text","value":" would perform worse than expected.\nDefine ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"KVcOXCA3Df"},{"type":"inlineMath","value":"\\Delta_\\infty","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"Δ\\Delta_\\inftyΔ","key":"ooZzZweoqT"},{"type":"text","value":" to be the most negative advantage, that is, ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Pj8wG5x42D"},{"type":"inlineMath","value":"\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"Δ=minsSAhπ(s,π~(s))\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))Δ=minsSAhπ(s,π~(s))","key":"g3QKEmusT4"},{"type":"text","value":".\nPlugging this into the ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Q6vEwS4bUm"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem 
","key":"URP3tiGd0A"},{"type":"text","value":"6.1","key":"p8if8hE5nC"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","key":"O7KqGF5qPe"},{"type":"text","value":" gives","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"wLnlK3N4Ui"}],"key":"H712131XjZ"},{"type":"math","value":"\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}","position":{"start":{"line":596,"column":1},"end":{"line":604,"column":1}},"html":"V0π~(s)V0π(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π~(s)V0π(s)HΔ.\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}V0π~(s)V0π(s)V0π~(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π(s)HΔ∣.","enumerator":"6.30","key":"ax9bED4yWt"},{"type":"paragraph","position":{"start":{"line":606,"column":1},"end":{"line":612,"column":1}},"children":[{"type":"text","value":"That is, for some state ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"KmbCnQ8id4"},{"type":"inlineMath","value":"s","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"sss","key":"aJI3IgmS46"},{"type":"text","value":", the lower bound on the performance of ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"wkRImZDac7"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"π~\\tilde \\piπ~","key":"PYVvNTjizF"},{"type":"text","value":" is ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"Ylip1Xomjd"},{"type":"emphasis","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"lower","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"pm9lJM5V4R"}],"key":"V8mwIGP3Fv"},{"type":"text","value":" than the performance of ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"VYzcRK9WW3"},{"type":"text","value":"π","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"tq9kR4Bd8X"},{"type":"text","value":".\nThis doesn’t state that ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"wxaQNmzOrY"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"π~\\tilde \\piπ~","key":"hIBRha22Wr"},{"type":"text","value":" ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"HhogVgf4nV"},{"type":"emphasis","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"will","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"bUKTqR9W7I"}],"key":"j4edBY0K5e"},{"type":"text","value":" necessarily perform worse than ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"ZgLsWI8ECy"},{"type":"text","value":"π","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"fiU5Qzf4vt"},{"type":"text","value":",\nonly suggests that it might be 
possible.\nIf such worst-case states do exist, though,\nPI does not avoid situations where the new policy often visits them;\nit does not enforce that the trajectory distributions ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"zp0fkB4qgS"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ\rho_\piρπ","key":"PejLZ8hp5x"},{"type":"text","value":" and ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"wpGIGIrzH4"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ~\rho_{\tilde \pi}ρπ~","key":"UqIn1tHM7Z"},{"type":"text","value":" be close to each other.\nIn other words, the “training distribution” that our prediction rule is fitted on, ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"evVfJ1oIBf"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ\rho_\piρπ","key":"jCp0a8uYNm"},{"type":"text","value":", may differ significantly from the “evaluation distribution” ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"Ik7QKXPnS7"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ~\rho_{\tilde \pi}ρπ~","key":"cE7AUWjaVk"},{"type":"text","value":".","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"HLRAVTmCJt"}],"key":"tav22ztYBK"},{"type":"comment","value":" \nThis is an instance of *distributional shift*.\nTo begin, let's ask, where *do* fitted approaches work well?\nThey are commonly seen in SL,\nwhere a prediction rule is fit using some labelled training set,\nand then assessed on a test set from the same distribution.\nBut policy iteration isn't performed in the same scenario:\nthere is now _distributional shift_ between the different iterations of the policy. 
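To make this concrete, here is a minimal sketch of how one could estimate the PDL's right-hand side by Monte Carlo. The trajectory format and the fitted advantage estimator `A_hat` are illustrative assumptions, not part of the text; the point it exposes is that the states must be sampled from the *new* policy, while the advantages belong to the *old* one.

```python
def estimate_performance_difference(new_policy_trajectories, A_hat):
    """Monte Carlo estimate of E_{τ ~ ρ_π̃,s} [ Σ_h A_h^π(s_h, a_h) ].

    new_policy_trajectories: rollouts of the *new* policy π̃ (hypothetical
        format: each trajectory is a list of (state, action, reward) tuples).
    A_hat: fitted advantage estimator of the *old* policy π,
        called as A_hat(h, s, a).
    """
    total = 0.0
    for τ in new_policy_trajectories:
        for h, (s, a, _r) in enumerate(τ):
            total += A_hat(h, s, a)
    return total / len(new_policy_trajectories)
```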
","key":"CixzaWxKf3"},{"type":"paragraph","position":{"start":{"line":623,"column":1},"end":{"line":629,"column":1}},"children":[{"type":"text","value":"On the other hand, policy gradient methods ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"fnTqxPiLnM"},{"type":"emphasis","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"do","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"zrR3CshO6V"}],"key":"bvX2F8KsVn"},{"type":"text","value":", albeit implicitly,\nencourage ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"qJBUlZ96aX"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"U6YTBnPkPQ"},{"type":"text","value":" and ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"dOZTqQ9vpK"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"SNjUrOAsfq"},{"type":"text","value":" to be similar.\nSuppose that the mapping from policy parameters to trajectory distributions is relatively smooth.\nThen, by adjusting the parameters only a small distance,\nthe new policy will also have a similar trajectory distribution.\nBut this is not very rigorous, and in practice the parameter-to-distribution mapping may not be so smooth.\nCan we constrain the distance between the resulting distributions more ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"zY9tyL4ODL"},{"type":"emphasis","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"AErO3rwL2l"}],"key":"PGbOYB6Ip2"},{"type":"text","value":"?","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"C1osq0gQMn"}],"key":"mBIRpjGO0m"},{"type":"paragraph","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"children":[{"type":"text","value":"This brings us to the next three methods:","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"key":"HZZhAFJeeS"}],"key":"r3ffsYBwrj"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":632,"column":1},"end":{"line":635,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"strong","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"text","value":"trust region policy optimization","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"key":"wugO0ZBV4O"}],"key":"YQhEUB7Oka"},{"type":"text","value":" (TRPO), which explicitly constrains the difference between the distributions before and after each step;","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"key":"DdPY1YM6sP"}],"key":"u5vlSAcVQt"},{"type":"listItem","spread":true,"position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"the 
","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"zWkQFHhkhL"},{"type":"strong","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"Y6vGnEM4Yj"}],"key":"M38v39IgVn"},{"type":"text","value":" (NPG), a first-order approximation of TRPO;","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"tGlT63pfTR"}],"key":"d1KGGYIojB"},{"type":"listItem","spread":true,"position":{"start":{"line":634,"column":1},"end":{"line":635,"column":1}},"children":[{"type":"strong","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"children":[{"type":"text","value":"proximal policy optimization","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"Gpb48Iw6zC"}],"key":"rzkfuV0NIT"},{"type":"text","value":" (PPO), a “soft relaxation” of TRPO.","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"yyzd7SO9US"}],"key":"qkgzbLZtUK"}],"key":"sY9XfIfK9Y"}],"key":"PRixxeKBxg"},{"type":"block","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":638,"column":1},"end":{"line":638,"column":1}},"children":[{"type":"text","value":"Trust region policy optimization","position":{"start":{"line":638,"column":1},"end":{"line":638,"column":1}},"key":"IF5SJxfXbg"}],"identifier":"trust-region-policy-optimization","label":"Trust region policy optimization","html_id":"trust-region-policy-optimization","implicit":true,"enumerator":"6.7","key":"lQpkGyUFTc"},{"type":"paragraph","position":{"start":{"line":640,"column":1},"end":{"line":644,"column":1}},"children":[{"type":"text","value":"We saw above that policy gradient methods are effective because they implicitly constrain how much the policy changes at each iteration.\nCan we design an algorithm that ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"P9rGaRFdY4"},{"type":"emphasis","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"kkI1Fy0bmX"}],"key":"iNyStpn2ee"},{"type":"text","value":" constrains the “step size”?\nThat is, we want to ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"AaYsjhIkGt"},{"type":"emphasis","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"improve","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"w3YTJvEK64"}],"key":"cnKJv4cMzQ"},{"type":"text","value":" the policy as much as possible,\nmeasured in terms of the r.h.s. 
of the ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"grw9hdP2Ru"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem ","key":"T7ULg7crsJ"},{"type":"text","value":"6.1","key":"rhQpzVxMBj"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","key":"udVzhoV8a8"},{"type":"text","value":",\nwhile ensuring that its trajectory distribution does not change too much:","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"yQBEG8eD5a"}],"key":"XI3yhmdfYo"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}","position":{"start":{"line":646,"column":1},"end":{"line":651,"column":1}},"html":"θk+1argmaxθoptEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}θk+1argθoptmaxEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ","enumerator":"6.31","key":"DGziRgUinX"},{"type":"paragraph","position":{"start":{"line":653,"column":1},"end":{"line":659,"column":1}},"children":[{"type":"text","value":"Note that we have made a small change to the r.h.s. 
expression:\nwe use the ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"gFfbY30211"},{"type":"emphasis","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"states","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"V675nMIPty"}],"key":"m4HWGjVfyI"},{"type":"text","value":" sampled from the old policy, and only use the ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"HHjFVFImbl"},{"type":"emphasis","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"R1ewsPv8yH"}],"key":"PtUuCEAP3L"},{"type":"text","value":" from the new policy.\nIt would be computationally infeasible to sample entire trajectories from ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"Zozj2zdHTC"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"npFxJd3PId"},{"type":"text","value":" as we are optimizing over ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"W3fyre3GyC"},{"type":"text","value":"θ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"G51LkR0PuS"},{"type":"text","value":".\nOn the other hand, if ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"EzwsCrNdEF"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"Vtsyod7g7u"},{"type":"text","value":" returns a vector representing a probability distribution over actions,\nthen evaluating the expected advantage with respect to this distribution only requires taking a dot product.\nThis approximation also matches the r.h.s. 
of the PDL to first order in ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"BMmytwKwvc"},{"type":"text","value":"θ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"dTHi2pJm1L"},{"type":"text","value":".\n(We will elaborate more on this later.)","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"shiAu0O2P6"}],"key":"zwMQUoHayI"},{"type":"paragraph","position":{"start":{"line":661,"column":1},"end":{"line":662,"column":1}},"children":[{"type":"text","value":"How do we describe the distance between ","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"QSeAGToGKj"},{"type":"inlineMath","value":"\\rho_{\\theta^{\\text{opt}}}","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"html":"ρθopt\\rho_{\\theta^{\\text{opt}}}ρθopt","key":"E43DZyyjPx"},{"type":"text","value":" and ","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"Aai7oyIcDk"},{"type":"inlineMath","value":"\\rho_{\\theta^k}","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"html":"ρθk\\rho_{\\theta^k}ρθk","key":"OznNtNzi2d"},{"type":"text","value":"?\nWe’ll use the ","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"sHXmP6snck"},{"type":"strong","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"children":[{"type":"text","value":"Kullback-Leibler divergence (KLD)","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"CnpAqNyaSb"}],"key":"MBIplEGuKG"},{"type":"text","value":":","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"ugE0CQEl9j"}],"key":"JqQVXPl2x6"},{"type":"proof","kind":"definition","label":"kld","identifier":"kld","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Kullback-Leibler divergence","position":{"start":{"line":664,"column":1},"end":{"line":664,"column":1}},"key":"GKy9HoDnw1"}],"key":"pJRZjA5O09"},{"type":"paragraph","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"children":[{"type":"text","value":"For two PDFs ","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"key":"SQjAj8fXyf"},{"type":"inlineMath","value":"p, q","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"html":"p,qp, qp,q","key":"RJnThomTz6"},{"type":"text","value":",","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"key":"mR1LP1uPle"}],"key":"auXZOaqhcn"},{"type":"math","value":"\\kl{p}{q} := \\E_{x \\sim p} \\left[ \\log \\frac{p(x)}{q(x)} \\right]","position":{"start":{"line":669,"column":1},"end":{"line":669,"column":1}},"html":"KL(pq):=Exp[logp(x)q(x)]\\kl{p}{q} := \\E_{x \\sim p} \\left[ \\log \\frac{p(x)}{q(x)} \\right]KL(pq):=Exp[logq(x)p(x)]","enumerator":"6.32","key":"F6URrukDPp"},{"type":"paragraph","position":{"start":{"line":671,"column":1},"end":{"line":674,"column":1}},"children":[{"type":"text","value":"This can be interpreted in many different ways, many stemming from information theory.\nOne such interpretation is that ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"mmQaxTPwQY"},{"type":"inlineMath","value":"\\kl{p}{q}","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"KL(pq)\\kl{p}{q}KL(pq)","key":"CTscU7UMMf"},{"type":"text","value":" describes my average “surprise” if I 
","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"rQqScUhPzy"},{"type":"emphasis","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"children":[{"type":"text","value":"think","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"MKn0P4auw1"}],"key":"CQp2kTnmA6"},{"type":"text","value":" data is being generated by ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"oOzU3CnqiQ"},{"type":"inlineMath","value":"q","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"qqq","key":"iB6yv7pmzG"},{"type":"text","value":" but it’s actually generated by ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"aIVvzrJQ0J"},{"type":"inlineMath","value":"p","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"ppp","key":"UJSDFsEYZ2"},{"type":"text","value":".\n(The ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"fHhLyDGr1a"},{"type":"strong","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"children":[{"type":"text","value":"surprise","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"yzpMHvMuwz"}],"key":"wt9vAzMUyJ"},{"type":"text","value":" of an event with probability ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"Fyvb88nyxJ"},{"type":"inlineMath","value":"p","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"ppp","key":"z3RJwmxQw4"},{"type":"text","value":" is ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"VCif6va18A"},{"type":"inlineMath","value":"- \\log_2 p","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"log2p- \\log_2 plog2p","key":"Plv9wiI7zM"},{"type":"text","value":".)\nNote that ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"srRED7wo0j"},{"type":"inlineMath","value":"\\kl{p}{q} = 0","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"KL(pq)=0\\kl{p}{q} = 0KL(pq)=0","key":"UZRQr9HJxq"},{"type":"text","value":" if and only if ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"XX0jHZaS2y"},{"type":"inlineMath","value":"p = q","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"p=qp = qp=q","key":"z1K6BVLmEQ"},{"type":"text","value":". 
Also note that it is generally ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"Cml9CISKYr"},{"type":"emphasis","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"NEkrTce9rT"}],"key":"m3VOLGbVZp"},{"type":"text","value":" symmetric.","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"Wx1fQd4uVD"}],"key":"AKE6DKLCb5"}],"enumerator":"6.3","html_id":"kld","key":"B3I8CaICBQ"},{"type":"paragraph","position":{"start":{"line":677,"column":1},"end":{"line":680,"column":1}},"children":[{"type":"text","value":"Both the objective function and the KLD constraint involve a weighted average over the space of all trajectories.\nThis is intractable in general, so we need to estimate the expectation.\nAs before, we can do this by taking an empirical average over samples from the trajectory distribution.\nThis gives us the following pseudocode:","position":{"start":{"line":677,"column":1},"end":{"line":677,"column":1}},"key":"jAFUzXgq5J"}],"key":"E5cuqN4qAS"},{"type":"proof","kind":"definition","label":"trpo","identifier":"trpo","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trust region policy optimization (exact)","position":{"start":{"line":682,"column":1},"end":{"line":682,"column":1}},"key":"DtpAioV18Y"}],"key":"aoNCHhhI4f"},{"type":"code","lang":"python","value":"def trpo_pseudocode(env, δ, θ_init, M, K):\n    θ = θ_init\n    for k in range(K):\n        trajectories = sample_trajectories(env, π(θ), M)\n        A_hat = fit(trajectories)\n\n        def approximate_gain(θ_):\n            # average expected advantage of the candidate policy π(θ_)\n            # over the states visited by the current policy π(θ)\n            total_advantage = 0\n            for τ in trajectories:\n                for s, _a, _r in τ:\n                    for a in env.action_space:\n                        total_advantage += π(θ_)(s, a) * A_hat(s, a)\n            return total_advantage / M\n\n        def constraint(θ_):\n            # empirical estimate of the KL divergence between the\n            # trajectory distributions of π(θ) and π(θ_)\n            kl_div = 0\n            for τ in trajectories:\n                for s, a, _r in τ:\n                    kl_div += jnp.log(π(θ)(s, a)) - jnp.log(π(θ_)(s, a))\n            return kl_div / M <= δ\n\n        θ = optimize(approximate_gain, constraint)\n\n    return θ","position":{"start":{"line":686,"column":1},"end":{"line":711,"column":1}},"key":"RX5aW5o1A4"}],"enumerator":"6.4","html_id":"trpo","key":"qTEyevmr6D"},{"type":"comment","value":"\nApplying importance sampling allows us to estimate the TRPO objective as follows:\n\n::::{prf:definition} Trust region policy optimization (implementation)\n:label: trpo_implement\n\n:::{prf:definitionic} TODO\nInitialize $\\theta^0$\n\nSample $N$ trajectories from $\\rho^k$ to learn a value estimator $\\tilde b_\\hi(s) \\approx V^{\\pi^k}_\\hi(s)$\n\nSample $M$ trajectories $\\tau_0, \\dots, \\tau_{M-1} \\sim \\rho^k$\n\n$$\\begin{gathered}\n \\theta^{k+1} \\gets \\arg\\max_{\\theta} \\frac{1}{M} \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} [ R_\\hi(\\tau_m) - \\tilde b_\\hi(s_\\hi) ] \\\\\n \\text{where } \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\log \\frac{\\pi_k(a_\\hi^m \\mid s_\\hi^m)}{\\pi_\\theta(a_\\hi^m \\mid s_\\hi^m)} \\le \\delta\n \n\\end{gathered}$$\n:::\n:::: ","key":"H0KWZ9H37k"},{"type":"paragraph","position":{"start":{"line":735,"column":1},"end":{"line":742,"column":1}},"children":[{"type":"text","value":"The above isn’t entirely complete:\nwe still need to solve the actual optimization problem at each step.\nUnless we know additional properties of the problem,\nthis might be an intractable optimization.\nDo we need to solve it exactly, though?\nInstead, if we assume that 
both the objective function and the constraint are somewhat smooth in terms of the policy parameters,\nwe can use their ","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"kzkF8qujl6"},{"type":"emphasis","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"children":[{"type":"text","value":"Taylor expansions","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"xJRbxDG7KZ"}],"key":"XPxtpEnnmO"},{"type":"text","value":" to give us a simpler optimization problem with a closed-form solution.\nThis brings us to the ","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"IZyIpS9Rcy"},{"type":"strong","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"SQqRRdePCP"}],"key":"FOCOoWxWqt"},{"type":"text","value":" algorithm.","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"JxXaGHInMN"}],"key":"hQf4aOokzQ"}],"key":"ysv0OF71GF"},{"type":"block","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":746,"column":1},"end":{"line":746,"column":1}},"children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":746,"column":1},"end":{"line":746,"column":1}},"key":"vHkUQVnOKA"}],"identifier":"natural-policy-gradient","label":"Natural policy gradient","html_id":"natural-policy-gradient","implicit":true,"enumerator":"6.8","key":"IlPnB9GvH2"},{"type":"paragraph","position":{"start":{"line":748,"column":1},"end":{"line":749,"column":1}},"children":[{"type":"text","value":"We take a ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"Anr8f0ciHr"},{"type":"emphasis","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"JAypcoZkiM"}],"key":"XNUwwGvEEH"},{"type":"text","value":" (first-order) approximation to the objective function and a ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"Kuf3gN5xPo"},{"type":"emphasis","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"children":[{"type":"text","value":"quadratic","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"pC7JVSs9u6"}],"key":"VzJ4lLy1Pd"},{"type":"text","value":" (second-order) approximation to the KL divergence constraint about the current estimate ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"Y2mq4acCu9"},{"type":"inlineMath","value":"\\theta^k","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"html":"θk\\theta^kθk","key":"CqAQ4W4Lwd"},{"type":"text","value":".\nThis results in the optimization problem","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"wzvAw5cYgn"}],"key":"badIBDpRYf"},{"type":"math","value":"\\begin{gathered}\n \\max_\\theta \\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}","label":"npg_optimization","identifier":"npg_optimization","html":"maxθθJ(πθk)(θθk)where 12(θθk)Fθk(θθk)δ\\begin{gathered}\n \\max_\\theta 
\\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}θmaxθJ(πθk)(θθk)where 21(θθk)Fθk(θθk)δ","enumerator":"6.33","html_id":"npg-optimization","key":"KUbl5IbFqa"},{"type":"paragraph","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"CAGm8fNrGl"},{"type":"inlineMath","value":"F_{\\theta^k}","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"html":"FθkF_{\\theta^k}Fθk","key":"h9i544EHwZ"},{"type":"text","value":" is the ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"GTkkGPyyao"},{"type":"strong","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"JGQkrTHvP9"}],"key":"Oz957TR2ju"},{"type":"text","value":" defined below.","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"zGcTPqFYLx"}],"key":"Rxwps8gPnG"},{"type":"proof","kind":"definition","label":"fisher_matrix","identifier":"fisher_matrix","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"hFF2vX97TI"}],"key":"dumlaAHhBL"},{"type":"paragraph","position":{"start":{"line":765,"column":1},"end":{"line":766,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"LzZROT2I6z"},{"type":"inlineMath","value":"p_\\theta","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"pθp_\\thetapθ","key":"W0TQlBkcFG"},{"type":"text","value":" denote a parameterized distribution.\nIts Fisher information matrix ","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"S9qdgonwK3"},{"type":"inlineMath","value":"F_\\theta","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"FθF_\\thetaFθ","key":"b3fpGXD68c"},{"type":"text","value":" can be defined equivalently as:","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"I3nigdImkM"}],"key":"aXDRrGTCBr"},{"type":"math","value":"\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}","position":{"start":{"line":768,"column":1},"end":{"line":773,"column":1}},"html":"Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]covariance matrix of the Fisher score=Expθ[θ2logpθ(x)]average Hessian of the negative log-likelihood\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]=Expθ[θ2logpθ(x)]covariance matrix of the Fisher scoreaverage Hessian of the 
negative log-likelihood","enumerator":"6.34","key":"aHAg7n2Yob"},{"type":"paragraph","position":{"start":{"line":775,"column":1},"end":{"line":778,"column":1}},"children":[{"type":"text","value":"Recall that the Hessian of a function describes its curvature:\nfor a vector ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"kI1qktJNcZ"},{"type":"inlineMath","value":"\\delta \\in \\Theta","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"html":"δΘ\\delta \\in \\ThetaδΘ","key":"Kd3FMojVPx"},{"type":"text","value":",\nthe quantity ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"VQcNXOg43o"},{"type":"inlineMath","value":"\\delta^\\top F_\\theta \\delta","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"html":"δFθδ\\delta^\\top F_\\theta \\deltaδFθδ","key":"GGRVlkn3Mu"},{"type":"text","value":" describes how rapidly the negative log-likelihood changes if we move by ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"akePYIRTEg"},{"type":"text","value":"δ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"IbtrFC5ACc"},{"type":"text","value":".\nThe Fisher information matrix is precisely the Hessian of the KL divergence (with respect to either one of the parameters).","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"fqClqzUBuG"}],"key":"nLzIQWZd6U"},{"type":"paragraph","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"children":[{"type":"text","value":"In particular, when ","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"key":"ujoCaCNuUF"},{"type":"inlineMath","value":"p_\\theta = \\rho_{\\theta}","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"html":"pθ=ρθp_\\theta = \\rho_{\\theta}pθ=ρθ","key":"wYZ8B9aLQ2"},{"type":"text","value":" denotes a trajectory distribution, we can further simplify the expression:","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"key":"fDkqSVpXE4"}],"key":"UtMsx7OnsE"},{"type":"math","value":"F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]","label":"fisher_trajectory","identifier":"fisher_trajectory","html":"Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]","enumerator":"6.35","html_id":"fisher-trajectory","key":"jefnGM6X1A"},{"type":"paragraph","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"children":[{"type":"text","value":"Note that we’ve used the Markov property to cancel out the cross terms corresponding to two different time steps.","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"HW80hJDF1a"}],"key":"lLMMONtIhN"}],"enumerator":"6.5","html_id":"fisher-matrix","key":"fKvEnrODVn"},{"type":"paragraph","position":{"start":{"line":791,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"text","value":"This is a convex optimization problem with a closed-form solution.\nTo see why, it helps to visualize the case where 
","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"uCyQi2h4DF"},{"type":"text","value":"θ","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"EoT0degf8u"},{"type":"text","value":" is two-dimensional:\nthe constraint describes the inside of an ellipse,\nand the objective function is linear,\nso we can find the extreme point on the boundary of the ellipse.\nWe recommend ","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"zCJKPMdfVp"},{"type":"cite","kind":"narrative","label":"boyd_convex_2004","identifier":"boyd_convex_2004","children":[{"type":"text","value":"Boyd & Vandenberghe (2004)","key":"nq2U9mvcVl"}],"enumerator":"1","key":"dOwyrcdb8H"},{"type":"text","value":" for a comprehensive treatment of convex optimization.","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"eV0XVq6Fpw"}],"key":"wRM2LOqLEL"},{"type":"paragraph","position":{"start":{"line":798,"column":1},"end":{"line":799,"column":1}},"children":[{"type":"text","value":"More generally, for a higher-dimensional ","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"nEdmV54M6m"},{"type":"text","value":"θ","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"IcYfW0FyFF"},{"type":"text","value":",\nwe can compute the global optima by setting the gradient of the Lagrangian to zero:","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"Gc1VxE9TAS"}],"key":"nQW5QSyjC6"},{"type":"math","value":"\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}","position":{"start":{"line":801,"column":1},"end":{"line":809,"column":1}},"html":"L(θ,α)=J(πθk)(θθk)α[12(θθk)Fθk(θθk)δ]L(θk+1,α):=0    J(πθk)=αFθk(θk+1θk)θk+1=θk+ηFθk1J(πθk)where η=2δJ(πθk)Fθk1J(πθk)\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}L(θ,α)L(θk+1,α)J(πθk)θk+1where η=J(πθk)(θθk)α[21(θθk)Fθk(θθk)δ]:=0=αFθk(θk+1θk)=θk+ηFθk1J(πθk)=J(πθk)Fθk1J(πθk)2δ","enumerator":"6.36","key":"PwWk7txzNI"},{"type":"paragraph","position":{"start":{"line":811,"column":1},"end":{"line":813,"column":1}},"children":[{"type":"text","value":"This gives us the closed-form update.\nNow the only challenge is to estimate the Fisher information matrix,\nsince, as with the KL divergence constraint, it is an expectation over trajectories, and computing it exactly is 
therefore typically intractable.","position":{"start":{"line":811,"column":1},"end":{"line":811,"column":1}},"key":"exy9Peb4d4"}],"key":"TrVdGSMtft"},{"type":"proof","kind":"definition","label":"npg","identifier":"npg","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"rybOC8yLRN"}],"key":"vd5VagFd1K"},{"type":"paragraph","position":{"start":{"line":818,"column":1},"end":{"line":820,"column":1}},"children":[{"type":"text","value":"How many trajectory samples do we need to accurately estimate the Fisher information matrix?\nAs a rule of thumb, the sample complexity should scale with the dimension of the parameter space.\nThis makes this approach intractable in the deep learning setting where we might have a very large number of parameters.","position":{"start":{"line":818,"column":1},"end":{"line":818,"column":1}},"key":"UlLOr5BQdX"}],"key":"DxNKGpEswN"}],"enumerator":"6.6","html_id":"npg","key":"c8gJTWAW4w"},{"type":"paragraph","position":{"start":{"line":823,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"As you can see, the NPG is the “basic” policy gradient algorithm we saw above,\nbut with the gradient transformed by the inverse Fisher information matrix.\nThis matrix can be understood as accounting for the ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"f8msndHYKo"},{"type":"strong","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"geometry of the parameter space.","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"UXZB3rYckN"}],"key":"REcN5l7gQq"},{"type":"text","value":"\nThe typical gradient descent algorithm implicitly measures distances between parameters using the typical ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"VkSHhhu9Ku"},{"type":"emphasis","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"Euclidean distance","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"VNlfX2txue"}],"key":"AFi8ZYoR6Z"},{"type":"text","value":".\nHere, where the parameters map to a ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"yWJu6zuLQl"},{"type":"emphasis","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"distribution","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"MXu7bgvkJl"}],"key":"mLWRghyQTs"},{"type":"text","value":", using the natural gradient update is equivalent to optimizing over ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"lo1I6ZLifX"},{"type":"strong","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"distribution space","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"Xa4tTbfwDR"}],"key":"GaCSdYoU84"},{"type":"text","value":" rather than parameter space,\nwhere distance between distributions is measured by the ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"U0wBhPAkdZ"},{"type":"crossReference","kind":"proof:definition","identifier":"kld","label":"kld","children":[{"type":"text","value":"Definition 
","key":"f8I7JL9QNJ"},{"type":"text","value":"6.3","key":"tcDf0BO8Q9"}],"template":"Definition %s","enumerator":"6.3","resolved":true,"html_id":"kld","key":"uxuwHxbUR5"},{"type":"text","value":".","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"kWFbsF1e2n"}],"key":"Q4qm4sg4in"},{"type":"proof","kind":"example","label":"natural_simple","identifier":"natural_simple","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural gradient on a simple problem","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"PQaeBwENF5"}],"key":"oxvqxncDTS"},{"type":"paragraph","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"children":[{"type":"text","value":"Let’s step away from RL and consider the following optimization problem over Bernoulli distributions ","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"key":"fJuKpsUnWt"},{"type":"inlineMath","value":"\\pi \\in \\Delta(\\{ 0, 1 \\})","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"html":"πΔ({0,1})\\pi \\in \\Delta(\\{ 0, 1 \\})πΔ({0,1})","key":"lLzQPoICUV"},{"type":"text","value":":","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"key":"YazO8ENgRo"}],"key":"OlzPp8YnHQ"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}","position":{"start":{"line":835,"column":1},"end":{"line":839,"column":1}},"html":"J(π)=100π(1)+1π(0)\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}J(π)=100π(1)+1π(0)","enumerator":"6.37","key":"KBTNV2QO7B"},{"type":"paragraph","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"children":[{"type":"text","value":"We can think of the space of such distributions as the line between ","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"CyjSfhFNw3"},{"type":"inlineMath","value":"(0, 1)","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"html":"(0,1)(0, 1)(0,1)","key":"KpuK2HYZs3"},{"type":"text","value":" to ","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"UVH67clDLW"},{"type":"inlineMath","value":"(1, 0)","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"html":"(1,0)(1, 0)(1,0)","key":"vafMQPCaI6"},{"type":"text","value":" on the Cartesian plane:","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"m6Saobak5w"}],"key":"rdMQWwyVZp"},{"type":"image","url":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.png","alt":"a line from (0, 1) to (1, 0)","width":"240px","align":"center","key":"iVXgtCkfCV","urlSource":"shared/npg_line.png","urlOptimized":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.webp"},{"type":"paragraph","position":{"start":{"line":849,"column":1},"end":{"line":851,"column":1}},"children":[{"type":"text","value":"Clearly the optimal distribution is the constant one ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"khjND6AtkL"},{"type":"inlineMath","value":"\\pi(1) = 1","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"html":"π(1)=1\\pi(1) = 1π(1)=1","key":"irpuTBWhAD"},{"type":"text","value":". 
Suppose we optimize over the parameterized family ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"hTW1Zksphg"},{"type":"inlineMath","value":"\\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"html":"πθ(1)=exp(θ)1+exp(θ)\pi_\theta(1) = \frac{\exp(\theta)}{1+\exp(\theta)}πθ(1)=1+exp(θ)exp(θ)","key":"HsNdrbFovf"},{"type":"text","value":".\nThen our optimization algorithm should set ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"VnP8CHQt9P"},{"type":"text","value":"θ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"fDnAzzjb3m"},{"type":"text","value":" to be unboundedly large.\nThe “vanilla” gradient is then","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"po1TCPCk8S"}],"key":"aMzAXw2paG"},{"type":"math","value":"\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.","position":{"start":{"line":853,"column":1},"end":{"line":853,"column":1}},"html":"θJ(πθ)=99exp(θ)(1+exp(θ))2.\nabla_\theta J(\pi_\theta) = \frac{99 \exp(\theta)}{(1 + \exp(\theta))^2}.θJ(πθ)=(1+exp(θ))299exp(θ).","enumerator":"6.38","key":"dX7hszxC1b"},{"type":"paragraph","position":{"start":{"line":855,"column":1},"end":{"line":856,"column":1}},"children":[{"type":"text","value":"Note that as ","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"KzIkNnM7gb"},{"type":"inlineMath","value":"\\theta \\to \\infty","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"html":"θ\theta \to \inftyθ","key":"PWkPOeLQNA"},{"type":"text","value":", the increments get closer and closer to ","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"B8nqQJe0Ui"},{"type":"text","value":"0","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"i2bgqfwwUe"},{"type":"text","value":";\nthe rate of increase becomes exponentially slow.","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"zKvc7aoUc1"}],"key":"qFbpjJMox7"},{"type":"paragraph","position":{"start":{"line":859,"column":1},"end":{"line":859,"column":1}},"children":[{"type":"text","value":"However, if we compute the Fisher information “matrix” (which is just a scalar in this case), we can account for the geometry induced by the parameterization.","position":{"start":{"line":859,"column":1},"end":{"line":859,"column":1}},"key":"DLtovoSHjb"}],"key":"O8lBnsSACl"},{"type":"math","value":"\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}","position":{"start":{"line":861,"column":1},"end":{"line":866,"column":1}},"html":"Fθ=Exπθ[(θlogπθ(x))2]=exp(θ)(1+exp(θ))2.\begin{aligned}\n F_\theta & = \E_{x \sim \pi_\theta} [ (\nabla_\theta \log \pi_\theta(x))^2 ] \\\n & = \frac{\exp(\theta)}{(1 + \exp(\theta))^2}.\n\end{aligned}Fθ=Exπθ[(θlogπθ(x))2]=(1+exp(θ))2exp(θ).","enumerator":"6.39","key":"sOX8MlvcrA"},{"type":"paragraph","position":{"start":{"line":868,"column":1},"end":{"line":868,"column":1}},"children":[{"type":"text","value":"This gives the natural gradient update","position":{"start":{"line":868,"column":1},"end":{"line":868,"column":1}},"key":"WnLWjxOJBY"}],"key":"x1iLArurJW"},{"type":"math","value":"\\begin{aligned}\n \\theta^{k+1} & = 
\\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}","position":{"start":{"line":870,"column":1},"end":{"line":875,"column":1}},"html":"θk+1=θk+ηFθk1θJ(θk)=θk+99η\\begin{aligned}\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}θk+1=θk+ηFθk1θJ(θk)=θk+99η","enumerator":"6.40","key":"V6WmcwQnoh"},{"type":"paragraph","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"which increases at a constant rate, i.e. improves the objective more quickly than “vanilla” gradient ascent.","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"YGD8qmYgKT"}],"key":"IflQ2PmCDZ"}],"enumerator":"6.1","html_id":"natural-simple","key":"yYC1Lejhbp"},{"type":"paragraph","position":{"start":{"line":880,"column":1},"end":{"line":884,"column":1}},"children":[{"type":"text","value":"Though the NPG now gives a closed-form optimization step,\nit requires computing the inverse Fisher information matrix,\nwhich typically scales as ","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"sWHTSj0rLt"},{"type":"inlineMath","value":"O((\\dim \\Theta)^3)","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"html":"O((dimΘ)3)O((\\dim \\Theta)^3)O((dimΘ)3)","key":"uNtp8X7f8N"},{"type":"text","value":".\nThis can be expensive if the parameter space is large.\nCan we find an algorithm that works in ","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"kMOroE89zf"},{"type":"emphasis","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"children":[{"type":"text","value":"linear time","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"tIs4xR8Dns"}],"key":"V0KMt58ox5"},{"type":"text","value":" with respect to the dimension of the parameter space?","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"o1dvTZeQqy"}],"key":"XdiO7ERU1l"}],"key":"TBZknQ9ah5"},{"type":"block","position":{"start":{"line":886,"column":1},"end":{"line":886,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":888,"column":1},"end":{"line":888,"column":1}},"children":[{"type":"text","value":"Proximal policy optimization","position":{"start":{"line":888,"column":1},"end":{"line":888,"column":1}},"key":"YvYQKnbSef"}],"identifier":"proximal-policy-optimization","label":"Proximal policy optimization","html_id":"proximal-policy-optimization","implicit":true,"enumerator":"6.9","key":"riuJzGXSQ1"},{"type":"paragraph","position":{"start":{"line":890,"column":1},"end":{"line":892,"column":1}},"children":[{"type":"text","value":"We can relax the TRPO optimization problem in a different way:\nRather than imposing a hard constraint on the KL distance,\nwe can instead impose a ","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"sSaZAYfDpD"},{"type":"emphasis","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"children":[{"type":"text","value":"soft","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"Rg4SnTE63Y"}],"key":"MbvYyjZaFX"},{"type":"text","value":" constraint by incorporating it into the objective and penalizing parameter values that drastically change the trajectory 
distribution.","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"rSBR4rUnUL"}],"key":"lByVX1NUSW"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}","position":{"start":{"line":894,"column":1},"end":{"line":898,"column":1}},"html":"θk+1argmaxθEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}θk+1argθmaxEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)","enumerator":"6.41","key":"MVlYrm0Tlh"},{"type":"paragraph","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"Here ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"IbAn5AtwOp"},{"type":"text","value":"λ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"iBvWymHTg8"},{"type":"text","value":" is a ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"fGMucq8cwT"},{"type":"strong","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"regularization hyperparameter","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"NBDhI7OlFz"}],"key":"pY5SVJvVHG"},{"type":"text","value":" that controls the tradeoff between the two terms.","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"KmkwRL3x0J"}],"key":"kMkCXnOk4I"},{"type":"paragraph","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"Like the original TRPO algorithm ","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"uIhlTP81VJ"},{"type":"crossReference","kind":"proof:definition","identifier":"trpo","label":"trpo","children":[{"type":"text","value":"Definition ","key":"Qibyq3jk58"},{"type":"text","value":"6.4","key":"dIrEXZxFZ0"}],"template":"Definition %s","enumerator":"6.4","resolved":true,"html_id":"trpo","key":"PeTjzlUkGx"},{"type":"text","value":", PPO is not gradient-based; rather, at each step, we try to maximize local advantage relative to the current policy.","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"uFgLI3o2Wm"}],"key":"ZRiGja8cYb"},{"type":"paragraph","position":{"start":{"line":904,"column":1},"end":{"line":905,"column":1}},"children":[{"type":"text","value":"How do we solve this optimization?\nLet us begin by simplifying the ","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"key":"iIZmHVRkyO"},{"type":"inlineMath","value":"\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"html":"KL(ρπkρπθ)\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}KL(ρπkρπθ)","key":"bzwKKaBwUm"},{"type":"text","value":" term. 
Expanding gives","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"key":"w09RF3AP3D"}],"key":"JOdDOrPl8U"},{"type":"math","value":"\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}","position":{"start":{"line":907,"column":1},"end":{"line":913,"column":1}},"html":"KL(ρπkρπθ)=Eτρπk[logρπk(τ)ρπθ(τ)]=Eτρπk[h=0H1logπk(ahsh)πθ(ahsh)]state transitions cancel=Eτρπk[h=0H1log1πθ(ahsh)]+c\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}KL(ρπkρπθ)=Eτρπk[logρπθ(τ)ρπk(τ)]=Eτρπk[h=0H1logπθ(ahsh)πk(ahsh)]=Eτρπk[h=0H1logπθ(ahsh)1]+cstate transitions cancel","enumerator":"6.42","key":"lYxfMsdYrB"},{"type":"paragraph","position":{"start":{"line":915,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"Qjd7aGDl7e"},{"type":"inlineMath","value":"c","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"html":"ccc","key":"A8zxKzatJV"},{"type":"text","value":" is some constant with respect to ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"t9qMpfa3Ix"},{"type":"text","value":"θ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"qorJH2PJ32"},{"type":"text","value":", and can be ignored.\nThis gives the objective","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"u8MeddeUx0"}],"key":"LowfNndtI7"},{"type":"math","value":"\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]","position":{"start":{"line":918,"column":1},"end":{"line":922,"column":1}},"html":"k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1log1πθ(ahsh)]\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1logπθ(ahsh)1]","enumerator":"6.43","key":"IUhnf0Bs55"},{"type":"paragraph","position":{"start":{"line":924,"column":1},"end":{"line":928,"column":1}},"children":[{"type":"text","value":"Once again, this takes an expectation over trajectories.\nBut here we cannot directly sample 
trajectories from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"g7HOD3iNnz"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πk\\pi^kπk","key":"IPl2EJonzW"},{"type":"text","value":",\nsince in the first term, the actions actually come from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"W04s6VaxOr"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"M6f8vtJ7Rc"},{"type":"text","value":".\nTo make this term line up with the other expectation,\nwe would need the actions to also come from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"wD2WPtfQ6N"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πk\\pi^kπk","key":"l407iHMnER"},{"type":"text","value":".","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"xDSDdyCbDQ"}],"key":"Kjvn8yfjia"},{"type":"paragraph","position":{"start":{"line":930,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"This should sound familiar:\nwe want to estimate an expectation over one distribution by sampling from another.\nWe can once again use ","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"urrKXX669g"},{"type":"crossReference","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"children":[{"type":"text","value":"Section ","key":"Q1c3jRStp0"},{"type":"text","value":"6.3.3","key":"KL1cJooXo7"}],"identifier":"importance_sampling","label":"importance_sampling","kind":"heading","template":"Section %s","enumerator":"6.3.3","resolved":true,"html_id":"importance-sampling","key":"QQVbeuJ2xT"},{"type":"text","value":" to rewrite the inner expectation:","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"YlYz3o2zUx"}],"key":"F10LwDP0fy"},{"type":"math","value":"\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)","position":{"start":{"line":934,"column":1},"end":{"line":938,"column":1}},"html":"Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πθ(ahsh)πk(ahsh)Aπk(sh,ah)\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πk(ahsh)πθ(ahsh)Aπk(sh,ah)","enumerator":"6.44","key":"ey05uRKJj2"},{"type":"paragraph","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"Now we can combine the expectations together to get the objective","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"pHqwHoft5Z"}],"key":"yTaBSWmHwT"},{"type":"math","value":"\\ell^k(\\theta) = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]","position":{"start":{"line":942,"column":1},"end":{"line":944,"column":1}},"html":"k(θ)=Eτρπk[h=0H1(πθ(ahsh)πk(ahsh)Aπk(sh,ah)λlog1πθ(ahsh))]\\ell^k(\\theta) = \\E_{\\tau \\sim 
\\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]k(θ)=Eτρπk[h=0H1(πk(ahsh)πθ(ahsh)Aπk(sh,ah)λlogπθ(ahsh)1)]","enumerator":"6.45","key":"Qf9AmJjjFu"},{"type":"paragraph","position":{"start":{"line":946,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Now we can estimate this function by a sample average over trajectories from ","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"key":"OcVuI0eXDn"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"html":"πk\\pi^kπk","key":"Sm7XpJQ8ve"},{"type":"text","value":".\nRemember that to complete a single iteration of PPO,\nwe execute","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"key":"IfsKvsRIiv"}],"key":"wnluv6XeZ7"},{"type":"math","value":"\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).","position":{"start":{"line":950,"column":1},"end":{"line":952,"column":1}},"html":"θk+1argmaxθk(θ).\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).θk+1argθmaxk(θ).","enumerator":"6.46","key":"LytsiBb4Oa"},{"type":"paragraph","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"children":[{"type":"text","value":"If ","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"key":"C9GbJ1hSDR"},{"type":"inlineMath","value":"\\ell^k","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"html":"k\\ell^kk","key":"B9cltOzw8G"},{"type":"text","value":" is differentiable, we can optimize it by gradient ascent, completing a single iteration of PPO.","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"key":"lQJk5Cp5Sa"}],"key":"RrCnwJ3GgT"},{"type":"code","lang":"python","value":"def ppo_pseudocode(\n env,\n π: Callable[[Params], Callable[[State, Action], Float]],\n λ: float,\n θ_init: Params,\n n_iters: int,\n n_fit_trajectories: int,\n n_sample_trajectories: int,\n):\n θ = θ_init\n for k in range(n_iters):\n fit_trajectories = sample_trajectories(env, π(θ), n_fit_trajectories)\n A_hat = fit(fit_trajectories)\n\n sample_trajectories = sample_trajectories(env, π(θ), n_sample_trajectories)\n \n def objective(θ_opt):\n total_objective = 0\n for τ in sample_trajectories:\n for s, a, _r in τ:\n total_objective += π(θ_opt)(s, a) / π(θ)(s, a) * A_hat(s, a) + λ * jnp.log(π(θ_opt)(s, a))\n return total_objective / n_sample_trajectories\n \n θ = optimize(objective, θ)\n\n return θ","position":{"start":{"line":956,"column":1},"end":{"line":983,"column":1}},"key":"wq4ivubqW8"},{"type":"heading","depth":2,"position":{"start":{"line":985,"column":1},"end":{"line":985,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":985,"column":1},"end":{"line":985,"column":1}},"key":"Mkl9PDbN60"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"6.10","key":"KwTIkfmTtI"},{"type":"paragraph","position":{"start":{"line":987,"column":1},"end":{"line":987,"column":1}},"children":[{"type":"text","value":"Policy gradient methods are a powerful family of algorithms that directly optimize the total reward by iteratively updating the policy 
parameters.","position":{"start":{"line":987,"column":1},"end":{"line":987,"column":1}},"key":"fKka461pw6"}],"key":"kRhr6eRgkv"},{"type":"paragraph","position":{"start":{"line":989,"column":1},"end":{"line":989,"column":1}},"children":[{"type":"text","value":"TODO","position":{"start":{"line":989,"column":1},"end":{"line":989,"column":1}},"key":"lsrXlDVT6Q"}],"key":"hUdruZzN2u"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":991,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":991,"column":1},"end":{"line":991,"column":1}},"children":[{"type":"text","value":"Vanilla policy gradient","position":{"start":{"line":991,"column":1},"end":{"line":991,"column":1}},"key":"DlNm985kim"}],"key":"aiooq0s81O"},{"type":"listItem","spread":true,"position":{"start":{"line":992,"column":1},"end":{"line":992,"column":1}},"children":[{"type":"text","value":"Baselines and advantages","position":{"start":{"line":992,"column":1},"end":{"line":992,"column":1}},"key":"o4bhnuaiRt"}],"key":"Db1IVgyDQ3"},{"type":"listItem","spread":true,"position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"children":[{"type":"text","value":"Trust region policy optimization","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"key":"DVZ38XIkN0"}],"key":"PKPLMgkHLZ"},{"type":"listItem","spread":true,"position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"key":"STF3aP7pu3"}],"key":"dxhfVjBhtI"},{"type":"listItem","spread":true,"position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"Proximal policy optimization","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"gil1Z8UPrj"}],"key":"ETsdz4hPk4"}],"key":"iynlH8Nrz3"}],"key":"OyrbVt6vd6"}],"key":"iFiZExDpCz"},"references":{"cite":{"order":["boyd_convex_2004"],"data":{"boyd_convex_2004":{"label":"boyd_convex_2004","enumerator":"1","html":"Boyd, S., & Vandenberghe, L. (2004). Convex Optimization. Cambridge University Press."}}}},"footer":{"navigation":{"prev":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/planning.html b/planning.html index d4985b0..a85f0cf 100644 --- a/planning.html +++ b/planning.html @@ -1,4 +1,4 @@ -8 Planning - CS/STAT 184: Introduction to Reinforcement Learning

8 Tree Search Methods

8.1Introduction

Have you ever lost a strategy game against a skilled opponent? It probably seemed like they were ahead of you at every turn. They might have been planning ahead and anticipating your actions, then planning around them in order to win.
@@ -26,7 +26,7 @@
Each possible state is a node in the tree, and since we only consider deterministic games, we can represent actions as edges leading from the current state to the next. Each path through the tree, from root to leaf, represents a single game.

[Figure: The first two layers of the complete game tree of tic-tac-toe. From Wikimedia.]

If you could store the complete game tree on a computer, you would be able to win every potentially winnable game by searching all paths from your current state and taking a winning move.
@@ -80,11 +80,11 @@
We can compute this by starting at the terminal states, when the game’s outcome is known, and working backwards, assuming that Max chooses the action that leads to the highest score and Min chooses the action that leads to the lowest score.

V_h^\star(s) = \begin{cases}
r(s) & h = H \\
\max_{a \in \mathcal{A}(s)} V_{h+1}^\star(P(s, a)) & h \text{ is even and } h < H \\
\min_{a \in \mathcal{A}(s)} V_{h+1}^\star(P(s, a)) & h \text{ is odd and } h < H
\end{cases}
(8.1)

This translates directly into a recursive depth-first search algorithm for searching the complete game tree.

def minimax_search(s, player) -> Tuple["Action", "Value"]:
     """Return the value of the state (for Max) and the best action for Max to take."""
     if env.is_terminal(s):
         return None, env.winner(s)

    if player is max:
        # initialize to -inf (rather than None) so the first value always wins the comparison
        a_max, v_max = None, -float("inf")
        for a in actions:
            _, v = minimax_search(env.step(s, a), min)
            if v > v_max:
                a_max, v_max = a, v
        return a_max, v_max
    else:
        a_min, v_min = None, float("inf")
        for a in actions:
             _, v = minimax_search(env.step(s, a), max)
             if v < v_min:
                 a_min, v_min = a, v
        return a_min, v_min


At each of the H timesteps, this algorithm iterates through the entire action space at that state, and therefore has a time complexity of n_A^H (where n_A is the largest number of actions possibly available at once). This makes the min-max algorithm impractical for even moderately sized games.

But do we need to compute the exact value of every possible state? Instead, is there some way we could “ignore” certain actions and their subtrees if we already know of better options? The alpha-beta search makes use of this intuition.

The intuition behind alpha-beta search is as follows: Suppose Max is in state s, and considering whether to take action a or a'. If at any point they find out that action a' is definitely worse than (or equal to) action a, they don’t need to evaluate action a' any further.

Concretely, we run min-max search as above, except now we keep track of two additional parameters α(s) and β(s) while evaluating each state.

Suppose we are evaluating V*_h(s), where it is Max’s turn (h is even). We update α(s) to be the highest value achievable from s so far. That is, the value of s is at least α(s). Suppose Max chooses action a, which leads to state s', in which it is Min’s turn. If any of Min’s actions in s' achieve a value V*_{h+1}(s') ≤ α(s), we know that Max would not choose action a, since they know that it is worse than whichever action gave the value α(s). Similarly, to evaluate a state on Min’s turn, we update β(s) to be the lowest value achievable from s so far. That is, the value of s is at most β(s). Suppose Min chooses action a, which leads to state s' for Max. If Max has any actions that do better than β(s), they would take it, making action a a suboptimal choice for Min.

def alpha_beta_search(s, player, alpha, beta) -> Tuple["Action", "Value"]:
     """Return the value of the state (for Max) and the best action for Max to take."""
     if env.is_terminal(s):
         return None, env.winner(s)
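
The hunk below elides the rest of alpha_beta_search. For reference, here is a minimal sketch of the full recursion, using the same assumed pseudocode interface as minimax_search above (env, actions, and the max/min player sentinels); this is our sketch, not necessarily the exact implementation in the notes.

def alpha_beta_sketch(s, player, alpha, beta) -> Tuple["Action", "Value"]:
    """Return the value of state s (for Max) and the best action for Max."""
    if env.is_terminal(s):
        return None, env.winner(s)

    if player is max:
        a_max, v_max = None, -float("inf")
        for a in actions:
            _, v = alpha_beta_sketch(env.step(s, a), min, alpha, beta)
            if v > v_max:
                a_max, v_max = a, v
            if v_max >= beta:
                break  # Min would never let the game reach s, so prune the rest
            if v_max > alpha:
                alpha = v_max  # the score Max is guaranteed so far
        return a_max, v_max
    else:
        a_min, v_min = None, float("inf")
        for a in actions:
            _, v = alpha_beta_sketch(env.step(s, a), max, alpha, beta)
            if v < v_min:
                a_min, v_min = a, v
            if v_min <= alpha:
                break  # Max would never let the game reach s, so prune the rest
            if v_min < beta:
                beta = v_min  # the (lowest) score Min is guaranteed so far
        return a_min, v_min

The initial call is alpha_beta_sketch(s_0, max, -float("inf"), float("inf")).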
@@ -177,7 +211,7 @@
 we call it a heuristic.

Can we develop heuristic methods for tree exploration that work for all sorts of games?

The task of evaluating actions in a complex environment might seem familiar. We’ve encountered this problem before in both the multi-armed bandits setting and the Markov decision process setting. Now we’ll see how to combine concepts from these to form a more general and efficient tree search heuristic called Monte Carlo Tree Search (MCTS).

When a problem is intractable to solve exactly, we often turn to approximate algorithms that sacrifice some accuracy in exchange for computational efficiency. MCTS also improves on alpha-beta search in this sense. As the name suggests, MCTS uses Monte Carlo simulation, that is, collecting random samples and computing the sample statistics,
@@ -196,8 +230,8 @@
where each action corresponds to an arm, and the reward distribution of arm k is the distribution of the game score over random games after choosing that arm. The most commonly used bandit algorithm in practice for MCTS is the Upper Confidence Bound (UCB) algorithm.
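
For concreteness, a common form of this score, written in terms of the statistics N^{s,a}, W^{s,a}, and N^s maintained in the loop below, is

UCB^{s,a} = W^{s,a} / N^{s,a} + c √(ln N^s / N^{s,a}),

where the first term is the average outcome of the games that passed through (s, a) and the second is an exploration bonus with coefficient c > 0. This is our sketch of the quantity referenced as (8.4); the exact constants there may differ.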

  • Append (s, a) to τ
  • Set s ← P(s, a)
  • Expansion: Let s_new denote the final state in τ (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from s_new. Call it a_new. Add it to τ.
  • Simulation: Simulate a complete game episode by starting with the action a_new and then playing according to π_rollout. This results in the outcome r ∈ {+1, −1}.
  • Backup: For each (s, a) ∈ τ:
    • Set N^{s,a} ← N^{s,a} + 1
    • W^{s,a} ← W^{s,a} + r
    • Set N^s ← N^s + 1
  • After T repeats of the above, we return the action with the highest UCB value (8.4). Then play continues. (A Python sketch of this loop follows below.)
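
To make the loop concrete, here is a minimal Python sketch in the same pseudocode register as the rest of the chapter. The environment interface (env.actions, env.step, env.is_terminal, env.winner) and the rollout policy pi_rollout are assumptions, states are assumed hashable, and, as in the description above, every node is scored from Max’s perspective for simplicity (a full two-player version would flip the sign on Min’s turns).

import math, random

def mcts_action(env, s0, pi_rollout, T, c=math.sqrt(2)):
    N, W, Ns = {}, {}, {}  # N[s, a], W[s, a], Ns[s], as in the backup step above

    def ucb(s, a):
        if N.get((s, a), 0) == 0:
            return math.inf  # untried actions are always selected first
        return W[s, a] / N[s, a] + c * math.sqrt(math.log(Ns[s]) / N[s, a])

    for _ in range(T):
        s, tau = s0, []
        # Selection: descend through fully explored states, choosing actions by UCB
        while not env.is_terminal(s) and all(N.get((s, a), 0) > 0 for a in env.actions(s)):
            a = max(env.actions(s), key=lambda a: ucb(s, a))
            tau.append((s, a))
            s = env.step(s, a)
        if env.is_terminal(s):
            r = env.winner(s)
        else:
            # Expansion: take one unexplored action from the final state in tau
            a_new = random.choice([a for a in env.actions(s) if N.get((s, a), 0) == 0])
            tau.append((s, a_new))
            s = env.step(s, a_new)
            # Simulation: play out the rest of the game according to pi_rollout
            while not env.is_terminal(s):
                s = env.step(s, pi_rollout(s))
            r = env.winner(s)  # outcome in {+1, -1}
        # Backup: update the statistics of every (s, a) visited this episode
        for s_i, a_i in tau:
            N[s_i, a_i] = N.get((s_i, a_i), 0) + 1
            W[s_i, a_i] = W.get((s_i, a_i), 0) + r
            Ns[s_i] = Ns.get(s_i, 0) + 1

    # After T repeats, return the action at the root with the highest UCB value
    return max(env.actions(s0), key=lambda a: ucb(s0, a))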

Between turns, we can keep the subtree whose statistics we have already gathered; the rest of the tree, for the actions we did not end up taking, gets discarded.

The application which brought the MCTS algorithm to fame was DeepMind’s AlphaGo (Silver et al., 2016). Since then, it has been used in numerous applications ranging from games to automated theorem proving.

How accurate is this Monte Carlo estimation? It depends heavily on the rollout policy π_rollout. If the distribution π_rollout induces over games is very different from the distribution seen during real gameplay, we might end up with a poor value approximation.

8.5.1Incorporating value functions and policies

To remedy this, we might make use of a value function v : S → ℝ that more efficiently approximates the value of a state. Then, we can replace the simulation step of MCTS with evaluating r = v(P(s_new, a_new)).

We might also make use of a policy function π : S → △(A) that provides “intuition” as to which actions are more valuable in a given state. We can scale the “exploration” term of (8.4) according to the policy function’s outputs.

Putting these together, we can describe an updated version of MCTS that makes use of these value and policy functions:


  • Append (s, a) to τ
  • Set s ← P(s, a)
  • Expansion: Let s_new denote the final state in τ (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from s_new. Call it a_new. Add it to τ.
  • Simulation: Let s_next = P(s_new, a_new). Evaluate r = v(s_next). This approximates the value of the game after taking the action a_new.
  • Backup: For each (s, a) ∈ τ:
    • N^{s,a} ← N^{s,a} + 1
    • W^{s,a} ← W^{s,a} + r
    • N^s ← N^s + 1
  • We finally return the action with the highest UCB value (8.5). Then play continues. As before, we can reuse the tree across timesteps. (A sketch of this variant follows below.)
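
As a sketch (again with the assumed env interface, and hypothetical learned functions v(s), returning a value estimate, and pi_guide(s, a), returning an action probability), only the exploration bonus and the simulation step change relative to the earlier loop:

import math, random

def guided_mcts_action(env, s0, v, pi_guide, T, c=math.sqrt(2)):
    N, W, Ns = {}, {}, {}

    def score(s, a):
        if N.get((s, a), 0) == 0:
            return math.inf
        # exploration term scaled by the guide policy's preference for a
        bonus = c * pi_guide(s, a) * math.sqrt(math.log(Ns[s]) / N[s, a])
        return W[s, a] / N[s, a] + bonus

    for _ in range(T):
        s, tau = s0, []
        while not env.is_terminal(s) and all(N.get((s, a), 0) > 0 for a in env.actions(s)):
            a = max(env.actions(s), key=lambda a: score(s, a))
            tau.append((s, a))
            s = env.step(s, a)
        if env.is_terminal(s):
            r = env.winner(s)
        else:
            a_new = random.choice([a for a in env.actions(s) if N.get((s, a), 0) == 0])
            tau.append((s, a_new))
            r = v(env.step(s, a_new))  # a value estimate replaces the random rollout
        for s_i, a_i in tau:
            N[s_i, a_i] = N.get((s_i, a_i), 0) + 1
            W[s_i, a_i] = W.get((s_i, a_i), 0) + r
            Ns[s_i] = Ns.get(s_i, 0) + 1

    return max(env.actions(s0), key=lambda a: score(s0, a))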

How do we actually compute a useful π_guide and v? If we have some existing dataset of trajectories, we could use supervised learning (that is, imitation learning) to generate a policy π_guide via behavioral cloning and learn v by regressing the game outcomes onto states. Then, plugging these into the above algorithm results in a stronger policy by using tree search to “think ahead”.

But we don’t have to stop at just one improvement step; we could iterate this process via self-play.

8.5.2Self-play

Recall the policy iteration algorithm from the MDPs chapter. Policy iteration alternates between policy evaluation (taking π and computing V^π) and policy improvement (setting π to be greedy with respect to V^π). Above, we saw how MCTS can be thought of as a “policy improvement” operation: for a given policy π^0, we can use it to guide MCTS, resulting in an algorithm that is itself a policy π^0_MCTS that maps from states to actions. Now, we can use behavioral cloning to obtain a new policy π^1 that imitates π^0_MCTS. We can then use π^1 to guide MCTS, and repeat.
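
In pseudocode, in the same style as the other sketches in these notes (every helper here is hypothetical: mcts_policy wraps MCTS guided by (π, v) into a policy, self_play plays that policy against itself and records outcomes, and behavioral_cloning and fit_value_function are the supervised learning steps):

def self_play_iteration(π, v, n_iters):
    for _ in range(n_iters):
        π_mcts = mcts_policy(π, v)      # policy improvement via tree search
        games = self_play(π_mcts)       # play π_mcts against itself, recording outcomes
        π = behavioral_cloning(games)   # imitate the stronger MCTS policy
        v = fit_value_function(games)   # regress game outcomes onto states
    return π, v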

    8.6References

    Chapter 5 of Russell & Norvig (2021) provides an excellent overview of search methods in games.

    References
    1. Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., van den Driessche, G., Schrittwieser, J., Antonoglou, I., Panneershelvam, V., Lanctot, M., Dieleman, S., Grewe, D., Nham, J., Kalchbrenner, N., Sutskever, I., Lillicrap, T., Leach, M., Kavukcuoglu, K., Graepel, T., & Hassabis, D. (2016). Mastering the Game of Go with Deep Neural Networks and Tree Search. Nature, 529(7587), 484–489. 10.1038/nature16961
    2. Russell, S. J., & Norvig, P. (2021). Artificial Intelligence: A Modern Approach (Fourth edition). Pearson.
    \ No newline at end of file diff --git a/planning.json b/planning.json index 65a2749..567da99 100644 --- a/planning.json +++ b/planning.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"c113fbe15387b85f550dd4e8af4b8e18e219b186b8c52024e2c6983dc8a6b3d8","slug":"planning","location":"/planning.md","dependencies":[],"frontmatter":{"title":"8 Planning","numbering":{"all":{"enabled":true},"enumerator":{"template":"8.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","thumbnailOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp","exports":[{"format":"md","filename":"planning.md","url":"/build/planning-3f770aaa8cb40e50ea7be60b3afba8d9.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"rHRgR8dsth"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"8.1","key":"EmAlXNdRKa"},{"type":"paragraph","position":{"start":{"line":22,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"Have you ever lost a strategy game against a skilled opponent?\nIt probably seemed like they were ahead of you at every turn.\nThey might have been ","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"HoyeZgohYR"},{"type":"emphasis","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"planning ahead","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"k8Iau8oNvf"}],"key":"OXJPCxgMSE"},{"type":"text","value":" and anticipating your actions,\nthen planning around them in order to win.\nIf this opponent was a computer,\nthey might have been using one of the strategies that we are about to explore.","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"LgDBlPuOrH"}],"key":"Ns6C6tRvTQ"},{"type":"heading","depth":2,"position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Deterministic, zero sum, fully observable two-player games","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"FXG15cECw4"}],"identifier":"deterministic-zero-sum-fully-observable-two-player-games","label":"Deterministic, zero sum, fully observable two-player games","html_id":"deterministic-zero-sum-fully-observable-two-player-games","implicit":true,"enumerator":"8.2","key":"YuO2fTNHAN"},{"type":"paragraph","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"In this chapter, we will focus on games that 
are:","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"DrDZT7OXB9"}],"key":"l1ERHI1jJx"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":33,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"deterministic,","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"jrxE8lIZI8"}],"key":"vMhTtBwqtM"}],"key":"jiDpQp3Qr9"},{"type":"listItem","spread":true,"position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"zero sum","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"kW9wh4sP5N"}],"key":"EC7DqfPbaz"},{"type":"text","value":" (one player wins and the other loses),","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"tgvRZGLrxb"}],"key":"jmrYcmCOdC"},{"type":"listItem","spread":true,"position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"fully observable,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"I1fPkOyfD8"}],"key":"kKpylbbjbe"},{"type":"text","value":" that is, the state of the game is perfectly known by both players,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"WsJSqav2UD"}],"key":"kMNzgT1WOt"},{"type":"listItem","spread":true,"position":{"start":{"line":36,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"for ","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"ayXgMSrSMl"},{"type":"emphasis","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"two players","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"GAAqUey2Ud"}],"key":"GEZ1pWAIPn"},{"type":"text","value":" that alternate turns,","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"Aqtf9ozBLK"}],"key":"NzbYJ4mzr5"}],"key":"WDRsCoOdII"},{"type":"paragraph","position":{"start":{"line":38,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"We can represent such a game as a ","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"ON7KDD8ESR"},{"type":"emphasis","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"text","value":"complete game tree.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"dwWWcoasy1"}],"key":"BTfaWSaxDr"},{"type":"text","value":"\nEach possible state is a node in the tree,\nand since we only consider deterministic games,\nwe can represent actions as edges leading from the current state to the next.\nEach path through the tree, from root to leaf, represents a single game.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"n0Hfsfr7Ih"}],"key":"JzWQ6IsKx5"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","alt":"The first two layers of the complete game tree of 
tic-tac-toe.\nFrom Wikimedia.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"IlYy7IW4XR","urlSource":"shared/tic_tac_toe.png","urlOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"The first two layers of the complete game tree of tic-tac-toe.\nFrom Wikimedia.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"BA4w2XIR2j"}],"key":"cubOGzHvZN"}],"key":"Y7Yp5P53n4"}],"enumerator":"8.1","key":"gf12kYcR28"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"If you could store the complete game tree on a computer,\nyou would be able to win every potentially winnable game\nby searching all paths from your current state and taking a winning move.\nWe will see an explicit algorithm for this in ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"njC0HmhjFt"},{"type":"crossReference","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"the next section","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"oV5LWpANwP"}],"identifier":"min-max-search","label":"min-max-search","kind":"heading","template":"Section %s","enumerator":"8.3","resolved":true,"html_id":"min-max-search","key":"jhj2wZTneL"},{"type":"text","value":".\nHowever, as games become more complex,\nit becomes computationally impossible to search every possible path.","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"kwFQ9NXkeR"}],"key":"wrlS5xTzk8"},{"type":"paragraph","position":{"start":{"line":58,"column":1},"end":{"line":66,"column":1}},"children":[{"type":"text","value":"For instance,\na chess player has roughly 30 actions to choose from at each turn,\nand each game takes roughly 40 moves per player,\nso trying to solve chess exactly using minimax\nwould take somewhere on the order of ","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"TiwqwnGDFE"},{"type":"inlineMath","value":"30^{80} \\approx 10^{118}","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"html":"30801011830^{80} \\approx 10^{118}308010118","key":"La9IVxsn6E"},{"type":"text","value":" operations.\nThat’s 10 billion billion billion billion billion billion billion billion billion billion billion billion billion operations.\nAs of the time of writing,\nthe fastest processor can achieve almost 10 GHz (10 billion operations per second),\nso to fully solve chess using minimax is many, many orders of magnitude out of reach.","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"coERh8pBRQ"}],"key":"mGBN7Wkenw"},{"type":"paragraph","position":{"start":{"line":68,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"It is thus intractable, in any realistic setting, to solve the complete game tree exactly.\nLuckily, only a small fraction of those games ever occur in reality;\nLater in this chapter,\nwe will explore ways to ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"wQ0yoYHbgG"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"prune 
away","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"hrbveiPA9T"}],"key":"sfsAQhQjLh"},{"type":"text","value":" parts of the tree that we know we can safely ignore.\nWe can also ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"Jh5eby52MG"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"uodqe5RLBV"}],"key":"oEqoKPMzWZ"},{"type":"text","value":" the value of a state without fully evaluating it.\nUsing these approximations, we can no longer ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"NA9BlLW4dX"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"guarantee","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"xPsqBNIHJW"}],"key":"eKiWOqTZmV"},{"type":"text","value":" winning the game,\nbut we can come up with strategies that will do well against most opponents.","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"Cif98KhV8b"}],"key":"aawtZgY3rB"},{"type":"heading","depth":3,"position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"Notation","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"KI9b1Fy7BL"}],"identifier":"notation","label":"Notation","html_id":"notation","implicit":true,"enumerator":"8.2.1","key":"qjOt6mMJbY"},{"type":"paragraph","position":{"start":{"line":78,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Let us now describe these games formally.\nWe’ll call the first player Max and the second player Min.\nMax seeks to maximize the final game score,\nwhile Min seeks to minimize the final game score.","position":{"start":{"line":78,"column":1},"end":{"line":78,"column":1}},"key":"j2n1rz3ufA"}],"key":"TLbrOp3SwW"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":83,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"We’ll use ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"aLdr5l9A5Y"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"S\\mathcal{S}S","key":"lTPYYgkYYL"},{"type":"text","value":" to denote the set of all possible game states.","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"fFWTu83SqZ"}],"key":"FxhLyIuJ9p"},{"type":"listItem","spread":true,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"The game begins in some ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"VJAQOiu51B"},{"type":"strong","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"initial state","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"UJspiXrDCi"}],"key":"QuYCItqgKe"},{"type":"text","value":" ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"JmfjHCVOZb"},{"type":"inlineMath","value":"s_0 \\in 
\\mathcal{S}","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"html":"s0Ss_0 \\in \\mathcal{S}s0S","key":"BwnaZBWPH1"},{"type":"text","value":".","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"rb8MmgwbJ8"}],"key":"LNhl4J5QU6"},{"type":"listItem","spread":true,"position":{"start":{"line":85,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"Max moves on even turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"DIFQ88D8px"},{"type":"inlineMath","value":"h = 2n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2nh = 2nh=2n","key":"V4ilgM6Kq4"},{"type":"text","value":",\nand Min moves on odd turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"HSyRPxQaSc"},{"type":"inlineMath","value":"h = 2n+1","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2n+1h = 2n+1h=2n+1","key":"s44MNcKaVI"},{"type":"text","value":",\nwhere ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"eQekkMJRQ6"},{"type":"inlineMath","value":"n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"nnn","key":"wrFERPgmiC"},{"type":"text","value":" is a natural number.","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"IApXAOGZ6l"}],"key":"JuaX2F9JMd"},{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"The space of possible actions, ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"VofDVjhpTD"},{"type":"inlineMath","value":"\\mathcal{A}_h(s)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"Ah(s)\\mathcal{A}_h(s)Ah(s)","key":"v9ylj0KF3S"},{"type":"text","value":",\ndepends on the state itself, as well as whose turn it is.\n(For example, in tic-tac-toe, Max can only play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"k1bfGOS3ZI"},{"type":"inlineCode","value":"X","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"dItXZliM3Y"},{"type":"text","value":"s while Min can only play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"tmRTJUPeww"},{"type":"inlineCode","value":"O","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"v0XIreazUi"},{"type":"text","value":"s.)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"G2F03VjJRz"}],"key":"N8l71K9yr8"},{"type":"listItem","spread":true,"position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"The game ends after ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"MLLJLngUmX"},{"type":"inlineMath","value":"H","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"html":"HHH","key":"tYAm7NsCJ5"},{"type":"text","value":" total moves (which might be even or odd). 
We call the final state a ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"B8bKSZc2i3"},{"type":"strong","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"terminal state","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"lTRRUseXW8"}],"key":"KbuXBaaxXl"},{"type":"text","value":".","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"G8rlTgiUIr"}],"key":"bsq3t21QZd"},{"type":"listItem","spread":true,"position":{"start":{"line":92,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"inlineMath","value":"P","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"PPP","key":"TKSq9ZmKms"},{"type":"text","value":" denotes the ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"W4XIxJJetA"},{"type":"strong","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"JvoCBnERaR"}],"key":"joqKQZOTNY"},{"type":"text","value":", that is,\n","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"g1kYed4oRr"},{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"STzzl0OSOt"},{"type":"text","value":" denotes the resulting state when taking action ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"eMvOsIy2WR"},{"type":"inlineMath","value":"a \\in \\mathcal{A}(s)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"aA(s)a \\in \\mathcal{A}(s)aA(s)","key":"E0wwrTjskQ"},{"type":"text","value":" in state ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"HEJCAJ1YlK"},{"type":"inlineMath","value":"s","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"sss","key":"HRIE431b9Q"},{"type":"text","value":".","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"sjPe0JHg34"}],"key":"Mo0nODTtLH"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"r(s)r(s)r(s)","key":"POmtSehPvk"},{"type":"text","value":" denotes the ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"iq7CJ09lP1"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"game score","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"RAFnqxsl1O"}],"key":"pPTIXmAkXn"},{"type":"text","value":" of the terminal state ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"m92N731Am0"},{"type":"inlineMath","value":"s","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"sss","key":"SLWofFDPbU"},{"type":"text","value":".\nNote that this is some positive or negative value seen by both players:\nA positive value indicates Max winning, a negative value indicates Min winning, and a value of 
","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"yUcBVIxCit"},{"type":"text","value":"0","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"YycE3po6w7"},{"type":"text","value":" indicates a tie.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"pJD30NAn9P"}],"key":"fJL0b6Ml7u"}],"key":"YoSXLW004d"},{"type":"paragraph","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"We also call the sequence of states and actions a ","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"RLXZKU4gJ1"},{"type":"strong","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"rgjztoJHcR"}],"key":"vRTTm4zKpS"},{"type":"text","value":".","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"nax0qLy3dQ"}],"key":"tyLej0iVFH"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"zrwKJcEqmr"}],"key":"v4rlSJ8EbX"},{"type":"paragraph","position":{"start":{"line":101,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"Above, we suppose that the game ends after ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"CXD2QmRGyT"},{"type":"inlineMath","value":"H","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"HHH","key":"OhNjlyTEGA"},{"type":"text","value":" total moves.\nBut most real games have a ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"PxkZfxksek"},{"type":"emphasis","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"variable","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"O7BG23OlhO"}],"key":"acYJsZ3X2z"},{"type":"text","value":" length.\nHow would you describe this?","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"Yv0D3xJd3f"}],"key":"WdeFqw5e8n"}],"key":"XXjGwjJKiK"},{"type":"paragraph","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"Let us frame tic-tac-toe in this setting.","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"KrzDED7miQ"}],"key":"Wjx6kPALEY"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":108,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":108,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"Each of the ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"xgAU2Civab"},{"type":"text","value":"9","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"tVsUtQNE7T"},{"type":"text","value":" squares is either empty, marked X, or marked O.\nSo there are ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"Kv1Ick7IBQ"},{"type":"inlineMath","value":"|\\mathcal{S}| = 3^9","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"html":"S=39|\\mathcal{S}| = 3^9S=39","key":"ehzBITxFMa"},{"type":"text","value":" potential states.\nNot all of these may be 
reachable!","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"HYxatPPIwM"}],"key":"hzf9uZAWtF"},{"type":"listItem","spread":true,"position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"children":[{"type":"text","value":"The initial state ","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"XlEFvEZccw"},{"type":"inlineMath","value":"s_0","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"html":"s0s_0s0","key":"dn80s2DnzG"},{"type":"text","value":" is the empty board.","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"R2krjNfpdm"}],"key":"aDlO4YX5yi"},{"type":"listItem","spread":true,"position":{"start":{"line":112,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"The set of possible actions for Max in state ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"WLZ9aQg2YP"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"RQShxOugWd"},{"type":"text","value":", ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"FFpHWxfYqH"},{"type":"inlineMath","value":"\\mathcal{A}_{2n}(s)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"A2n(s)\\mathcal{A}_{2n}(s)A2n(s)","key":"DNs7qRAC4o"},{"type":"text","value":", is the set of tuples ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"xphupeIhrY"},{"type":"inlineMath","value":"(\\text{``X''}, i)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"(“X”,i)(\\text{``X''}, i)(“X”,i)","key":"JcBAoJUhzV"},{"type":"text","value":" where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"fzMu6CviqT"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"WqAAibB5YN"},{"type":"text","value":" refers to an empty square in ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"itU4TC1S6l"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"VOlh22U3Or"},{"type":"text","value":".\nSimilarly, ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"oCnxoMYwWu"},{"type":"inlineMath","value":"\\mathcal{A}_{2n+1}(s)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"A2n+1(s)\\mathcal{A}_{2n+1}(s)A2n+1(s)","key":"pptksfrIIW"},{"type":"text","value":" is the set of tuples ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"T23Fq0BNtz"},{"type":"inlineMath","value":"(\\text{``O''}, i)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"(“O”,i)(\\text{``O''}, i)(“O”,i)","key":"Bh0FSFgEe3"},{"type":"text","value":" where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"PqTC5j2QdK"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"UracxnCI2r"},{"type":"text","value":" refers to an empty square in 
","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"hX5ySlVGx9"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"buwPTcRDpo"},{"type":"text","value":".","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"HndgMiU53d"}],"key":"SkSNtdtevZ"},{"type":"listItem","spread":true,"position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"We can take ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"o6Kw2OLC7a"},{"type":"inlineMath","value":"H = 9","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"H=9H = 9H=9","key":"eTjUeDj6bS"},{"type":"text","value":" as the longest possible game length.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"Qi8q445nKh"}],"key":"FPAcLXBodz"},{"type":"listItem","spread":true,"position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"tro6rDMnj9"},{"type":"text","value":" for a ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"sUkRuy92gI"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"EVhkiepdhe"}],"key":"NNnVZbI93z"},{"type":"text","value":" state ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"BePcr2gv7b"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"Fg2HXfraLP"},{"type":"text","value":" is simply the board with the symbol and square specified by ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"A3ar2RV1qb"},{"type":"inlineMath","value":"a","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"aaa","key":"uJHxA4gMN4"},{"type":"text","value":" marked into ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"fy7ops6VIY"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"eNFHeDdzGI"},{"type":"text","value":". Otherwise, if ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"OhAyGlO17T"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"tNkFAVMpOB"},{"type":"text","value":" is a ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"lm0b03ocGR"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"v6RHf9oSAD"}],"key":"rp3WUrS8kj"},{"type":"text","value":" state, i.e. 
it already has three symbols in a row, the state no longer changes.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"nkCWXKNF3L"}],"key":"nSk27hS12q"},{"type":"listItem","spread":true,"position":{"start":{"line":116,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"html":"r(s)r(s)r(s)","key":"B34nwZqXcc"},{"type":"text","value":" at a ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"px4VLIPTac"},{"type":"emphasis","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"LT3b1rHJmz"}],"key":"DqofZXenNU"},{"type":"text","value":" state is ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"azA5xQypTl"},{"type":"text","value":"+1","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"zwfBZolLkK"},{"type":"text","value":" if there are three Xs in a row, ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"VIuIugQvsY"},{"type":"text","value":"-1","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"dMEZVrLvst"},{"type":"text","value":" if there are three Os in a row, and ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"sKIzduGe3H"},{"type":"text","value":"0","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"NwklYnQwBZ"},{"type":"text","value":" otherwise.","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"IbteRS2DFM"}],"key":"CWYO3HCx2F"}],"key":"DlDP2eL5hK"},{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"text","value":"Our notation may remind you of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"jtyVGqiPKr"},{"type":"link","url":"/mdps","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"Markov decision processes","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"ySJiZngoxq"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"IG0IeTcjsq"},{"type":"text","value":".\nGiven that these games also involve a sequence of states and actions,\ncan we formulate them as finite-horizon MDPs?\nThe two settings are not exactly analogous,\nsince in MDPs we only consider a ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"kN8ZamB216"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"single","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"CeKgZE6pzj"}],"key":"V66YlpUdIg"},{"type":"text","value":" policy,\nwhile these games involve two distinct players with opposite objectives.\nSince we want to analyze the behavior of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"qQpVy2aab4"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"both","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"XlgY2GTyxG"}],"key":"jpmaWMf8eS"},{"type":"text","value":" players 
at the same time,\ndescribing such a game as an MDP is more trouble than it’s worth.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"pveI6qeynn"}],"key":"NKPCAN3YQw"},{"type":"heading","depth":2,"position":{"start":{"line":128,"column":1},"end":{"line":128,"column":1}},"children":[{"type":"text","value":"Min-max search *","position":{"start":{"line":128,"column":1},"end":{"line":128,"column":1}},"key":"T1ZcY1Xwo4"}],"label":"min-max-search","identifier":"min-max-search","html_id":"min-max-search","enumerator":"8.3","key":"dvvz2AecWp"},{"type":"admonition","kind":"important","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Important","key":"UyGJ5wHHYx"}],"key":"hY5Mrdw12E"},{"type":"paragraph","position":{"start":{"line":131,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"The course (Fall 2024) does not cover min-max search.\nThis content is here to provide background on ","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"ZIC6Ddo9ZH"},{"type":"emphasis","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"children":[{"type":"text","value":"optimally","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"V1lKP02snC"}],"key":"fDXbP2klLp"},{"type":"text","value":" solving these deterministic, zero-sum, two-player games.","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"KXYqJizJDK"}],"key":"vriezwgG0b"}],"key":"HlGxgliIZj"},{"type":"paragraph","position":{"start":{"line":135,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"In the introduction,\nwe claimed that we could win any potentially winnable game by looking ahead and predicting the opponent’s actions.\nThis would mean that each ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"oeHiL7xLN5"},{"type":"emphasis","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"QoibuQJxpD"}],"key":"OhDdlqw3dC"},{"type":"text","value":" state already has some predetermined game score,\nthat is, in each state,\nit is already “obvious” which player is going to win.\nLet ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"cQUeIaVR2a"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"fciaUtHjQt"},{"type":"text","value":" denote the game score under optimal play starting in state ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"H5WCyqBYfy"},{"type":"inlineMath","value":"s","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"sss","key":"ZgFSLQ1V2G"},{"type":"text","value":" at time ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"r82oNvCPPX"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"h\\hih","key":"PR61VelLay"},{"type":"text","value":".\nWe can compute this by starting at the terminal states,\nwhen the game’s outcome is known,\nand working backwards,\nassuming that Max chooses the action that leads to the highest score\nand Min chooses the action that leads to the lowest 
score.","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"mLPNhXBnQq"}],"key":"iIYtBCRvUZ"},{"type":"proof","kind":"algorithm","label":"min-max-value","identifier":"min-max-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Min-max search algorithm","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"mbx1KoEvww"}],"key":"wOIWEGVszx"},{"type":"math","value":"V_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is even and } h < H \\\\\n\\min_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is odd and } h < H \\\\\n\\end{cases}","position":{"start":{"line":150,"column":1},"end":{"line":156,"column":1}},"html":"Vh(s)={r(s)h=HmaxaA(s)Vh+1(P(s,a))h is even and h<HminaA(s)Vh+1(P(s,a))h is odd and h<HV_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is even and } h < H \\\\\n\\min_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is odd and } h < H \\\\\n\\end{cases}Vh(s)=r(s)maxaA(s)Vh+1(P(s,a))minaA(s)Vh+1(P(s,a))h=Hh is even and h<Hh is odd and h<H","enumerator":"8.1","key":"VIEpV5QTQx"}],"enumerator":"8.1","html_id":"min-max-value","key":"mXJN9n8FRk"},{"type":"paragraph","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"This translates directly into a recursive depth-first search algorithm for searching the game tree.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"M4PV3QRbLG"}],"key":"XdJOmjLSwq"},{"type":"code","lang":"python","value":"def minimax_search(s, player) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), min)\n if v > v_max:\n a_max, v_max = a, v\n return a_max, v_max\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), max)\n if v < v_min:\n a_min, v_min = a, v\n return a_min, v_min","position":{"start":{"line":161,"column":1},"end":{"line":181,"column":1}},"key":"KF6QCHqIDU"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":187,"column":1}},"children":[{"type":"text","value":"At each of the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"DHlDNQCF78"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"H\\horH","key":"V3YByfxvlC"},{"type":"text","value":" timesteps,\nthis algorithm iterates through the entire action space at that state,\nand therefore has a time complexity of ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"NDiRCIqOoU"},{"type":"inlineMath","value":"\\hor^{n_A}","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"HnA\\hor^{n_A}HnA","key":"uqzPEZjfFi"},{"type":"text","value":"\n(where ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"RlOsB0bvxo"},{"type":"inlineMath","value":"n_A","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"nAn_AnA","key":"XmENWpaC1s"},{"type":"text","value":" is the largest number of actions possibly 
available at once).\nFor instance, a game with 10 available actions per turn and a horizon of 10 already requires examining roughly ten billion states.\nThis makes the min-max algorithm impractical for even moderately sized games.","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"iTyYoB3gSX"}],"key":"lMGGPqAOzT"},{"type":"paragraph","position":{"start":{"line":189,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"But do we need to compute the exact value of ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"ZE2Eh9BFV7"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"every","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"dFB1THsaNl"}],"key":"wS6oFqOzQO"},{"type":"text","value":" possible state?\nInstead, is there some way we could “ignore” certain actions and their subtrees\nif we already know of better options?\nThe ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"k3xCceISFL"},{"type":"strong","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"alpha-beta search","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"y5eAs42yvu"}],"key":"pSqLgxjhXO"},{"type":"text","value":" makes use of this intuition.","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"Law0V0jxxq"}],"key":"RTCzEaFQo0"},{"type":"heading","depth":2,"position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"Alpha-beta search","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"WTiUfTxDtF"}],"identifier":"alpha-beta-search","label":"Alpha-beta search","html_id":"alpha-beta-search","implicit":true,"enumerator":"8.4","key":"zgBsdTAd50"},{"type":"paragraph","position":{"start":{"line":196,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"text","value":"The intuition behind alpha-beta search is as follows:\nSuppose Max is in state ","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"iiGc52lqhm"},{"type":"inlineMath","value":"s","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"html":"sss","key":"nwSpDc7U1f"},{"type":"text","value":",\nand considering whether to take action ","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"L21VkI6wp2"},{"type":"inlineMath","value":"a","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"html":"aaa","key":"h5Jbm8ygcZ"},{"type":"text","value":" or ","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"GNcot1v0wa"},{"type":"inlineMath","value":"a'","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"html":"aa'a","key":"BiKUJRQCQq"},{"type":"text","value":".\nIf at any point they find out that action ","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"hLosE7LBTH"},{"type":"inlineMath","value":"a'","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"html":"aa'a","key":"WZYBsPTM59"},{"type":"text","value":" is definitely worse than, or equal to, action ","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"pcVTedC8DH"},{"type":"inlineMath","value":"a","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"html":"aaa","key":"fnqVGSHqGh"},{"type":"text","value":",\nthey don’t need to evaluate action 
","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"MfJRUrzhCL"},{"type":"inlineMath","value":"a'","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"html":"aa'a","key":"YAyAI6l0Qz"},{"type":"text","value":" any further.\nLet us illustrate alpha-beta search with an example.","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"no3tflVsSJ"}],"key":"FfQ8am8wzf"},{"type":"paragraph","position":{"start":{"line":203,"column":1},"end":{"line":210,"column":1}},"children":[{"type":"text","value":"Concretely, we run min-max search as above,\nexcept now we keep track of two additional parameters ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"XPHEkS61tR"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"qvsMnCCB9b"},{"type":"text","value":" and ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"htdbgsktsr"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"e5uKBzw63n"},{"type":"text","value":" while evaluating each state.\n","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"qYcq4hhmhX"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"MYsoMYrebR"},{"type":"text","value":" represents the ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"lYXCfa3JTy"},{"type":"emphasis","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"children":[{"type":"text","value":"highest","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"MGgyW8s5vf"}],"key":"KuI5dnB3wJ"},{"type":"text","value":" known game score Max can achieve from state ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"kl2Fvr49y7"},{"type":"inlineMath","value":"s","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"sss","key":"AIKMqdNo7a"},{"type":"text","value":",\nand ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"EJ0rcLu4LT"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"gzjqadfGbT"},{"type":"text","value":" represents the ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"znmBE7Xht3"},{"type":"emphasis","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"children":[{"type":"text","value":"lowest","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"p2MQZVMoVH"}],"key":"fAD3mg9B6k"},{"type":"text","value":" known game score Min can achieve from state ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"t4v3TYWoSw"},{"type":"inlineMath","value":"s","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"sss","key":"fF30QGNfum"},{"type":"text","value":".\nSo if Max is in state ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"ZXEMvI9uTU"},{"type":"inlineMath","value":"s","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"sss","key":"HtKS2d8X2z"},{"type":"text","value":", and 
evaluating a move that leads to state ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"NJtk4V3Vcx"},{"type":"inlineMath","value":"s'","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"ss's","key":"GesveXypxo"},{"type":"text","value":",\nand they find that state ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"ND1wwzqOPP"},{"type":"inlineMath","value":"s'","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"ss's","key":"HMCBBUuyxz"},{"type":"text","value":" has some value ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"cD4CwtPOmp"},{"type":"emphasis","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"children":[{"type":"text","value":"greater","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"W6hG5O8JZ4"}],"key":"BNMQrDZRls"},{"type":"text","value":" than ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"XA1DDIOeCr"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"b0dyUInxDW"},{"type":"text","value":",\nthey can stop evaluating,\nsince they know Min would not choose an action that enters state ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"mlQ1L56Ejb"},{"type":"inlineMath","value":"s","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"sss","key":"pGoFa7yZ27"},{"type":"text","value":".","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"Lp3nihuGvH"}],"key":"JpdjqcUGEZ"},{"type":"proof","kind":"example","label":"alpha-beta-example","identifier":"alpha-beta-example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Alpha-beta search for a simple game","position":{"start":{"line":212,"column":1},"end":{"line":212,"column":1}},"key":"YdkuoQ3zdL"}],"key":"i77VtL0Hk8"},{"type":"paragraph","position":{"start":{"line":215,"column":1},"end":{"line":220,"column":1}},"children":[{"type":"text","value":"Consider a simple game that consists of just one move by Max and one move by Min. Each player has three available actions. 
Each pair of moves leads to a different integer outcome.\nMax tries to find the optimal action using a depth-first search.\nThey imagine taking the first action,\nand then imagine each of the actions that Min could take.\nThey know that Min will choose whichever option minimizes Max’s score.\nThus the value of taking the first action is updated exactly:","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"ez7JPRhtvD"}],"key":"WmrFIWrVDb"},{"type":"paragraph","position":{"start":{"line":222,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"image","url":"/build/alpha-beta-0-86df61f03eb0632eea80f3163fe9a594.png","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"IAWGANFwDU","urlSource":"./shared/alpha-beta-0.png","urlOptimized":"/build/alpha-beta-0-86df61f03eb0632eea80f3163fe9a594.webp"},{"type":"text","value":"\n","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"jxUv7fBhCM"},{"type":"image","url":"/build/alpha-beta-1-43143fa8bfb0c172dabc44ea3c5e83c7.png","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"vdZJhm8cUb","urlSource":"./shared/alpha-beta-1.png","urlOptimized":"/build/alpha-beta-1-43143fa8bfb0c172dabc44ea3c5e83c7.webp"},{"type":"text","value":"\n","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"GTvBps6zKn"},{"type":"image","url":"/build/alpha-beta-2-516bc4f7e36c5bee12bb1f5e38728fa7.png","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"ORPddpt7BW","urlSource":"./shared/alpha-beta-2.png","urlOptimized":"/build/alpha-beta-2-516bc4f7e36c5bee12bb1f5e38728fa7.webp"}],"key":"waaWUDcI5H"},{"type":"paragraph","position":{"start":{"line":226,"column":1},"end":{"line":229,"column":1}},"children":[{"type":"text","value":"Then Max imagines taking the second action.\nOnce again, they imagine each of the actions that Min could take,\nin order.\nThey find that the first of Min’s actions in this state leads to a ","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"key":"eQDC6fcqhA"},{"type":"emphasis","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"children":[{"type":"text","value":"worse","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"key":"uWeCrUx1Mn"}],"key":"TBNI3T19cM"},{"type":"text","value":" outcome (for Max):","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"key":"FUdvGhUCTQ"}],"key":"Xulsq9BaWf"},{"type":"image","url":"/build/alpha-beta-3-c9d8a3e5cb26e00825b3bad6a297b5b8.png","position":{"start":{"line":231,"column":1},"end":{"line":231,"column":1}},"key":"dlBoixplnS","urlSource":"./shared/alpha-beta-3.png","urlOptimized":"/build/alpha-beta-3-c9d8a3e5cb26e00825b3bad6a297b5b8.webp"},{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"text","value":"Now Max doesn’t need to explore Min’s other actions;\nthey know that taking the second action will lead to an outcome at least as bad as the first outcome above,\nso they would always prefer taking action one instead of action two.\nSo Max moves on to considering the third 
action:","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"WgLeGHzpWV"}],"key":"OhX0Lquw01"},{"type":"image","url":"/build/alpha-beta-4-49895f9c64529305e635210b8a678694.png","position":{"start":{"line":238,"column":1},"end":{"line":238,"column":1}},"key":"XrP4q3t7pR","urlSource":"./shared/alpha-beta-4.png","urlOptimized":"/build/alpha-beta-4-49895f9c64529305e635210b8a678694.webp"},{"type":"paragraph","position":{"start":{"line":240,"column":1},"end":{"line":241,"column":1}},"children":[{"type":"text","value":"There is still a chance that this action might outperform action one,\nso they continue expanding:","position":{"start":{"line":240,"column":1},"end":{"line":240,"column":1}},"key":"NFmt2hF1wz"}],"key":"P4mtFRhh0q"},{"type":"image","url":"/build/alpha-beta-5-1adbef535477fc346233514ea2f759a2.png","position":{"start":{"line":243,"column":1},"end":{"line":243,"column":1}},"key":"mdnaaY0zDS","urlSource":"./shared/alpha-beta-5.png","urlOptimized":"/build/alpha-beta-5-1adbef535477fc346233514ea2f759a2.webp"},{"type":"paragraph","position":{"start":{"line":245,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"Now they know taking action three leads to an outcome worse than action one,\nso they do not need to consider any further states.","position":{"start":{"line":245,"column":1},"end":{"line":245,"column":1}},"key":"PXacuObCoC"}],"key":"ENRZ6mi2Hv"}],"enumerator":"8.1","html_id":"alpha-beta-example","key":"abCWBLuwWw"},{"type":"code","lang":"python","value":"def alpha_beta_search(s, player, alpha, beta) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), min, alpha, beta)\n if v > v_max:\n a_max, v_max = a, v\n alpha = max(alpha, v)\n if v_max >= beta:\n # we know Min will not choose the action that leads to this state\n return a_max, v_max\n return a_max, v_max\n\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), max)\n if v < v_min:\n a_min, v_min = a, v\n beta = min(beta, v)\n if v_min <= alpha:\n # we know Max will not choose the action that leads to this state\n return a_min, v_min\n return a_min, v_min","position":{"start":{"line":250,"column":1},"end":{"line":279,"column":1}},"key":"dbHUKbAebi"},{"type":"paragraph","position":{"start":{"line":281,"column":1},"end":{"line":289,"column":1}},"children":[{"type":"text","value":"How do we choose what ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"FeKHB9efPf"},{"type":"emphasis","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"children":[{"type":"text","value":"order","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"Xwc5864lQV"}],"key":"PJC33vdE3t"},{"type":"text","value":" to explore the branches?\nAs you can tell, this significantly affects the efficiency of the pruning algorithm.\nIf Max explores the possible actions in order from worst to best,\nthey will not be able to prune any branches at all!\nAdditionally, to verify that an action is suboptimal,\nwe must run the search recursively from that action,\nwhich ultimately requires traversing the tree all the way to a leaf node.\nThe longer the game might possibly last,\nthe more computation we have to 
run.","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"VhWnus9YUE"}],"key":"RZR9HfSQ74"},{"type":"paragraph","position":{"start":{"line":291,"column":1},"end":{"line":294,"column":1}},"children":[{"type":"text","value":"In practice, we can often use background information about the game to develop a ","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"PmJSDqKeYN"},{"type":"strong","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"heuristic","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"r3HNuQ2CFz"}],"key":"MQvkyVclmj"},{"type":"text","value":" for evaluating possible actions.\nIf a technique is based on background information or intuition,\nespecially if it isn’t rigorously justified,\nwe call it a heuristic.","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"GfLrojQiD7"}],"key":"DzTd0wGOfs"},{"type":"paragraph","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"Can we develop ","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"YOrGt4Oz2q"},{"type":"emphasis","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"heuristic methods","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"nHrQE2qGCb"}],"key":"jko9zk82GK"},{"type":"text","value":" for tree exploration that works for all sorts of games?","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"IwDAyLwkDA"}],"key":"aGGqQ2oCWS"},{"type":"comment","value":" Here's where we can incorporate the _reinforcement learning_ ","key":"iiepIHDnOE"},{"type":"heading","depth":2,"position":{"start":{"line":299,"column":1},"end":{"line":299,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":299,"column":1},"end":{"line":299,"column":1}},"key":"LhDlPKHfHu"}],"identifier":"monte-carlo-tree-search","label":"Monte Carlo Tree Search","html_id":"monte-carlo-tree-search","implicit":true,"enumerator":"8.5","key":"XAJBOS0yfi"},{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"text","value":"The task of evaluating actions in a complex environment might seem familiar.\nWe’ve encountered this problem before in both the ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"SG36or3JG6"},{"type":"link","url":"/bandits","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Jx7LnROTXU"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"TysYrl3oVD"},{"type":"text","value":" setting and the ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"pYSUXyvj38"},{"type":"link","url":"/mdps","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"Markov decision process","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"cEncwgy9xF"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"hSBtCIo2uZ"},{"type":"text","value":" setting.\nNow we’ll see how to combine concepts from these to form a 
more general and efficient tree search heuristic called ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"d6L7PNRKTu"},{"type":"strong","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Jmmnh0QNtM"}],"key":"s2R2eYUgwz"},{"type":"text","value":" (MCTS).","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"tIccOC9YMP"}],"key":"hSV6TdTQta"},{"type":"paragraph","position":{"start":{"line":305,"column":1},"end":{"line":310,"column":1}},"children":[{"type":"text","value":"When a problem is intractable to solve ","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"shnWXmcPmD"},{"type":"emphasis","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"M2XKZ7Qx2W"}],"key":"H8OaNvxJNt"},{"type":"text","value":",\nwe often turn to ","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"hKJjSwrRrd"},{"type":"emphasis","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"am99Mjtlz9"}],"key":"poHeN8ouCd"},{"type":"text","value":" or ","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"FYRy3efE76"},{"type":"emphasis","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"children":[{"type":"text","value":"randomized","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"VcQwL94RUw"}],"key":"nZ0yIwosJM"},{"type":"text","value":" algorithms that sacrifice some accuracy in exchange for computational efficiency.\nMCTS also improves on alpha-beta search in this sense.\nAs the name suggests,\nMCTS uses ","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"lXgkS2J55x"},{"type":"emphasis","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"children":[{"type":"text","value":"Monte Carlo","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"MKYe6S4VL4"}],"key":"fQpVEhG4Xl"},{"type":"text","value":" simulation, that is, collecting random samples and computing the sample statistics,\nin order to ","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"iz1Q5Ce2HU"},{"type":"emphasis","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"pbCwf2llQb"}],"key":"Ftfib21DRe"},{"type":"text","value":" the value of each action.","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"pfDbAnwkJU"}],"key":"h7YTL7e33j"},{"type":"paragraph","position":{"start":{"line":312,"column":1},"end":{"line":318,"column":1}},"children":[{"type":"text","value":"As before, we imagine a complete game tree in which each path represents an ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"MpTYc3xhS6"},{"type":"emphasis","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"entire 
game","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"Ph12MMKDpc"}],"key":"UUiDig3uyC"},{"type":"text","value":".\nThe goal of MCTS is to assign values to only the game states that are ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"j4P0Gv9J9B"},{"type":"emphasis","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"relevant","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"Q4wFkkABjq"}],"key":"rDd7A34zQo"},{"type":"text","value":" to the ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"ZHdM8ZRs5G"},{"type":"emphasis","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"current game","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"nRK2lxxExY"}],"key":"zdEM5j3itN"},{"type":"text","value":";\nWe gradually expand the tree at each move.\nFor comparison, in alpha-beta search,\nthe entire tree only needs to be solved ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"SglqdCl4Wn"},{"type":"emphasis","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"once","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"goFDw9di6N"}],"key":"FU69nqwFdX"},{"type":"text","value":",\nand from then on,\nchoosing an action is as simple as taking a maximum over the previously computed values.","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"h4r4a7fddG"}],"key":"Al4oOnk1Lu"},{"type":"paragraph","position":{"start":{"line":320,"column":1},"end":{"line":324,"column":1}},"children":[{"type":"text","value":"The crux of MCTS is approximating the win probability of a state by a ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"DqkF7WLEQD"},{"type":"emphasis","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"sample probability","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"bkBqTBJ6Po"}],"key":"d6oSiEVGFc"},{"type":"text","value":".\nIn practice, MCTS is used for games with ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"vOniUKAT9C"},{"type":"emphasis","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"binary outcomes","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"I2fMP671TU"}],"key":"ImesutBFee"},{"type":"text","value":" where ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"dvLQLwjBMm"},{"type":"inlineMath","value":"r(s) \\in \\{ +1, -1 \\}","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"html":"r(s){+1,1}r(s) \\in \\{ +1, -1 \\}r(s){+1,1}","key":"BcXvGrfvVu"},{"type":"text","value":",\nand so this is equivalent to approximating the final game score.\nTo approximate the win probability from state ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"TTx1j4t1ST"},{"type":"inlineMath","value":"s","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"html":"sss","key":"DWnyJZrPoY"},{"type":"text","value":",\nMCTS samples random games starting in 
","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"AL9zk92VEx"},{"type":"inlineMath","value":"s","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"html":"sss","key":"UnncgIeRDX"},{"type":"text","value":" and computes the sample proportion of those that the player wins.","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"rMHhE8iV3I"}],"key":"gEthQDIeVU"},{"type":"paragraph","position":{"start":{"line":326,"column":1},"end":{"line":330,"column":1}},"children":[{"type":"text","value":"Note that, for a given state ","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"key":"feUK2zNq6g"},{"type":"inlineMath","value":"s","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"html":"sss","key":"UIKrFzIuPZ"},{"type":"text","value":",\nchoosing the best action ","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"key":"hKB2n2sQY7"},{"type":"inlineMath","value":"a","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"html":"aaa","key":"yTozZB73yG"},{"type":"text","value":" can be framed as a ","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"key":"N4SgYlrQFC"},{"type":"link","url":"/bandits","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"key":"wzypvvqNby"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"rFLhYerYMF"},{"type":"text","value":" problem,\nwhere each action corresponds to an arm,\nand the reward distribution of arm ","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"key":"F83zLjEt4j"},{"type":"inlineMath","value":"k","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"html":"kkk","key":"YHKyhqUIOz"},{"type":"text","value":" is the distribution of the game score over random games after choosing that arm.\nThe most commonly used bandit algorithm in practice for MCTS is the ","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"key":"zmWlkjmxC4"},{"type":"crossReference","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"children":[{"type":"text","value":"Upper Confidence Bound (UCB)","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"JFvCOJmqg7"}],"identifier":"ucb","label":"ucb","kind":"heading","template":"Section %s","enumerator":"3.6","resolved":true,"html_id":"ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"nKZEmiINqS"},{"type":"text","value":" algorithm.","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"key":"AK9o3W2LNa"}],"key":"iMCkro4XcM"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Summary of UCB","position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"key":"LaPJZLR2Nx"}],"key":"eyNClscroK"},{"type":"paragraph","position":{"start":{"line":333,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"Let us quickly review the UCB bandit algorithm.\nFor each arm 
","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"AVcLwtwloF"},{"type":"inlineMath","value":"k","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"kkk","key":"YvMqYVm5xn"},{"type":"text","value":", we track the sample mean","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"ACnBax7Dkf"}],"key":"FoGmB4g961"},{"type":"math","value":"\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tau","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"tight":true,"html":"μ^tk=1Ntkτ=0t11{aτ=k}rτ\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tauμ^tk=Ntk1τ=0t11{aτ=k}rτ","enumerator":"8.2","key":"X44PTkQAu1"},{"type":"paragraph","position":{"start":{"line":333,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"of all rewards from that arm up to time ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"rLbFAePSgP"},{"type":"inlineMath","value":"t","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"ttt","key":"zvO0mAyZbZ"},{"type":"text","value":".\nThen we construct a ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"q9DwlAc7Uq"},{"type":"emphasis","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"confidence interval","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"SoYlh1fD7O"}],"key":"Nyyt1CGCwy"},{"type":"text","value":"","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"RNk0BxLD6O"}],"key":"NHo0315R1c"},{"type":"math","value":"C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + B_t^k],","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"tight":true,"html":"Ctk=[μ^tkBtk,μ^tk+Btk],C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + B_t^k],Ctk=[μ^tkBtk,μ^tk+Btk],","enumerator":"8.3","key":"bLQ3nCFzI3"},{"type":"paragraph","position":{"start":{"line":333,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"KPmtD0sPQq"},{"type":"inlineMath","value":"B_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"Btk=ln(2t/δ)2NtkB_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}Btk=2Ntkln(2t/δ)","key":"v1IdEawDAc"},{"type":"text","value":" is given by Hoeffding’s inequality,\nso that with probability ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"ULGi70Dvgp"},{"type":"text","value":"δ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"RsEbuFDm9r"},{"type":"text","value":" (some fixed parameter we choose),\nthe true mean ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"ZEuR7db4q8"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"μk\\mu^kμk","key":"s5GgvJI3Il"},{"type":"text","value":" lies within ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"YrMCrJNot5"},{"type":"inlineMath","value":"C_t^k","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"CtkC_t^kCtk","key":"rZ22REJtkH"},{"type":"text","value":".\nNote that 
","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"QKRkUyuAnN"},{"type":"inlineMath","value":"B_t^k","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"BtkB_t^kBtk","key":"YDYhpbjKTD"},{"type":"text","value":" scales like ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"M8QxxOVJ0N"},{"type":"inlineMath","value":"\\sqrt{1/N^k_t}","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"1/Ntk\\sqrt{1/N^k_t}1/Ntk","key":"bZC56k5aDD"},{"type":"text","value":",\ni.e. the more we have visited that arm,\nthe more confident we get about it,\nand the narrower the confidence interval.","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"iC2OZ1D9u9"}],"key":"idcuVVosjS"},{"type":"paragraph","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"children":[{"type":"text","value":"To select an arm, we pick the arm with the highest ","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"ZYUJcArSIP"},{"type":"emphasis","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"S2QMg2u4XS"}],"key":"AQ50XCWNSI"},{"type":"text","value":".","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"qgZDQUsbsN"}],"key":"FYF5XgCIhb"}],"key":"AU4NPxoPHt"},{"type":"paragraph","position":{"start":{"line":350,"column":1},"end":{"line":351,"column":1}},"children":[{"type":"text","value":"This means that, for each edge (corresponding to a state-action pair ","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"SWJU3GPRkD"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"ruZihJwSQP"},{"type":"text","value":") in the game tree,\nwe keep track of the statistics required to compute its UCB:","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"Ra1g4ykDb4"}],"key":"ZjQ3VGozrx"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":353,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"How many times it has been “visited” (","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"key":"oErWVoB9F6"},{"type":"inlineMath","value":"N_t^{s, a}","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"html":"Nts,aN_t^{s, a}Nts,a","key":"wICnuoJuB6"},{"type":"text","value":")","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"key":"l5zFjUx7KY"}],"key":"olaYqf3F4o"},{"type":"listItem","spread":true,"position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"text","value":"How many of those visits resulted in victory (","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"Fmuj0tTzM9"},{"type":"inlineMath","value":"\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tau","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"html":"τ=0t11{(sτ,aτ)=(s,a)}rτ\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} 
{"type":"paragraph","position":{"start":{"line":350,"column":1},"end":{"line":351,"column":1}},"children":[{"type":"text","value":"This means that, for each edge (corresponding to a state-action pair ","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"SWJU3GPRkD"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"ruZihJwSQP"},{"type":"text","value":") in the game tree,\nwe keep track of the statistics required to compute its UCB:","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"Ra1g4ykDb4"}],"key":"ZjQ3VGozrx"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":353,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"How many times it has been “visited” (","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"key":"oErWVoB9F6"},{"type":"inlineMath","value":"N_t^{s, a}","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"html":"Nts,aN_t^{s, a}Nts,a","key":"wICnuoJuB6"},{"type":"text","value":")","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"key":"l5zFjUx7KY"}],"key":"olaYqf3F4o"},{"type":"listItem","spread":true,"position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"text","value":"How many of those visits resulted in victory (","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"Fmuj0tTzM9"},{"type":"inlineMath","value":"\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tau","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"html":"τ=0t11{(sτ,aτ)=(s,a)}rτ\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tauτ=0t11{(sτ,aτ)=(s,a)}rτ","key":"u13kcnHHnT"},{"type":"text","value":").\nLet us call this latter value ","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"kW7Gnv73Bd"},{"type":"inlineMath","value":"W^{s, a}_t","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"html":"Wts,aW^{s, a}_tWts,a","key":"ilY674Psyz"},{"type":"text","value":" (for number of “wins”).","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"XVGvGVwN3P"}],"key":"ew8bvy9Iui"}],"key":"STSNea6N37"},{"type":"paragraph","position":{"start":{"line":357,"column":1},"end":{"line":364,"column":1}},"children":[{"type":"text","value":"What does ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"G65hZXosaL"},{"type":"inlineMath","value":"t","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"ttt","key":"ZETpgmef9C"},{"type":"text","value":" refer to in the above expressions?\nRecall ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"NJLdIGHgdJ"},{"type":"inlineMath","value":"t","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"ttt","key":"Y7XW2uUtBp"},{"type":"text","value":" refers to the number of time steps elapsed in the ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"cCclRQfUNn"},{"type":"emphasis","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"children":[{"type":"text","value":"bandit environment","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"CfptEZIZMD"}],"key":"fya8ydAyJs"},{"type":"text","value":".\nAs mentioned above,\neach state ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"u8PmolB8M5"},{"type":"inlineMath","value":"s","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"sss","key":"cPtv72AOsL"},{"type":"text","value":" corresponds to its own bandit environment,\nand so ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"Mn4ZUsGwuj"},{"type":"inlineMath","value":"t","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"ttt","key":"iD1Oh2ksbX"},{"type":"text","value":" refers to ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"uzLEbWbPOi"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"NsN^sNs","key":"EsHz354gHE"},{"type":"text","value":", that is,\nhow many actions have been taken from state ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"QOiqZ7427w"},{"type":"inlineMath","value":"s","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"sss","key":"wdRVrB1mTJ"},{"type":"text","value":".\nThis term, ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"F4U7ApI3WG"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"NsN^sNs","key":"py4fq1AbuI"},{"type":"text","value":", gets incremented as the algorithm runs;\nfor simplicity, we won’t introduce another index to track how it 
changes.","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"UoTIRzr8z3"}],"key":"gHHYwWgWBN"},{"type":"proof","kind":"algorithm","label":"mcts-algorithm","identifier":"mcts-algorithm","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search algorithm","position":{"start":{"line":366,"column":1},"end":{"line":366,"column":1}},"key":"PraV3dNEEv"}],"key":"DVw9vwqNnB"},{"type":"paragraph","position":{"start":{"line":369,"column":1},"end":{"line":369,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":369,"column":1},"end":{"line":369,"column":1}},"key":"pJPKJ1aNOa"}],"key":"U9t8z0SqoP"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":370,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"html":"TTT","key":"UzDZBgDYWw"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"IsIZeygsX9"}],"key":"qtCyMVpzsh"},{"type":"listItem","spread":true,"position":{"start":{"line":371,"column":1},"end":{"line":371,"column":1}},"children":[{"type":"inlineMath","value":"\\pi_{\\text{rollout}}","position":{"start":{"line":371,"column":1},"end":{"line":371,"column":1}},"html":"πrollout\\pi_{\\text{rollout}}πrollout","key":"Ha0hdeGiUI"},{"type":"text","value":", the ","position":{"start":{"line":371,"column":1},"end":{"line":371,"column":1}},"key":"uEDHGwMIR2"},{"type":"strong","position":{"start":{"line":371,"column":1},"end":{"line":371,"column":1}},"children":[{"type":"text","value":"rollout policy","position":{"start":{"line":371,"column":1},"end":{"line":371,"column":1}},"key":"gNZBxyyv9S"}],"key":"e8NcPmuHXA"},{"type":"text","value":" for randomly sampling games","position":{"start":{"line":371,"column":1},"end":{"line":371,"column":1}},"key":"nBY89RoyTz"}],"key":"El33e9bfUt"},{"type":"listItem","spread":true,"position":{"start":{"line":372,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"html":"ccc","key":"iRz3Cf8XaS"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"VEhoOzIlZ3"}],"key":"kb54qryU6M"}],"key":"c7xNastzV8"},{"type":"paragraph","position":{"start":{"line":374,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"To choose a single move starting at state ","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"SdbSe48qTN"},{"type":"inlineMath","value":"s_{\\text{start}}","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"html":"sstarts_{\\text{start}}sstart","key":"DfrPUOdKhH"},{"type":"text","value":",\nMCTS first tries to estimate the UCB values for each of the possible actions ","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"JU5mQn4qg9"},{"type":"inlineMath","value":"\\mathcal{A}(s_\\text{start})","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"html":"A(sstart)\\mathcal{A}(s_\\text{start})A(sstart)","key":"FQ4cDAk0y1"},{"type":"text","value":",\nand then chooses the best 
one.\nTo estimate the UCB values,\nit repeats the following four steps ","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"wzvE5sjG9Y"},{"type":"inlineMath","value":"T","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"html":"TTT","key":"iUhXB1ABWU"},{"type":"text","value":" times:","position":{"start":{"line":374,"column":1},"end":{"line":374,"column":1}},"key":"WblOtM0cHL"}],"key":"gej1kG3Rx5"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":380,"column":1},"end":{"line":397,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":380,"column":1},"end":{"line":388,"column":1}},"children":[{"type":"strong","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"IsDbPm8b8t"}],"key":"acaaXJBczo"},{"type":"text","value":": We start at ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"tGsbJyiwuW"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"W4ah8BRbIZ"},{"type":"text","value":". Let ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"el9ExaHVWm"},{"type":"text","value":"τ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"qR9tk5GIcZ"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"YwsYjreaUu"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":381,"column":1},"end":{"line":388,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":381,"column":1},"end":{"line":388,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"U9U3mqeKvX"},{"type":"inlineMath","value":"s","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"html":"sss","key":"WEaLAisSqX"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"nAQWfEKmjH"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":382,"column":1},"end":{"line":388,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":382,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"key":"NTIEqPOZ5e"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"zucpJIr5IW"},{"type":"text","value":", where\n","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"key":"ozxKZvU3Mp"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^{s, a}} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"identifier":"ucb-tree","label":"ucb-tree","html_id":"ucb-tree","html":"UCBs,a=Ws,aNs,a+clnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^{s, a}} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}UCBs,a=Ns,aWs,a+cNs,alnNs","enumerator":"8.4","key":"W9Sos9viWO"}],"key":"hyO5mG7p3I"},{"type":"listItem","spread":true,"position":{"start":{"line":387,"column":1},"end":{"line":387,"column":1}},"children":[{"type":"text","value":"Append ","position":{"start":{"line":387,"column":1},"end":{"line":387,"column":1}},"key":"oWTK96AhqO"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":387,"column":1},"end":{"line":387,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"aWUOWjdtxR"},{"type":"text","value":" to ","position":{"start":{"line":387,"column":1},"end":{"line":387,"column":1}},"key":"sqvhaovITk"},{"type":"text","value":"τ","position":{"start":{"line":387,"column":1},"end":{"line":387,"column":1}},"key":"Glb8tIA0EP"}],"key":"pShX4HKF8S"},{"type":"listItem","spread":true,"position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"OgxHzrw9Cg"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"html":"sP(s,a)s \\gets P(s, a)sP(s,a)","key":"oW8Cg0zpCm"}],"key":"i3Dfzvxc9y"}],"key":"MOtmx7VTQ8"}],"key":"Dehu6LbB9e"}],"key":"pgXg4nURS4"}],"key":"nNI06LBlOg"},{"type":"listItem","spread":true,"position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"children":[{"type":"strong","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"uG5QkwuIQK"}],"key":"FSBQZKpYxh"},{"type":"text","value":": Let ","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"XEQuA9Kzs8"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"html":"snews_\\text{new}snew","key":"qECFTEBCfT"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"wFlz1qMRH4"},{"type":"text","value":"τ","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"QDHahETm0X"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"EaoOO0DzqJ"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"html":"snews_\\text{new}snew","key":"EEQXR7uoR8"},{"type":"text","value":". Call it ","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"ZaOPWRj6aK"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"html":"anewa_{\\text{new}}anew","key":"X2a4Tq2gER"},{"type":"text","value":". 
Add it to ","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"QNhzNEWlxi"},{"type":"text","value":"τ","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"nYsz2AMWBb"},{"type":"text","value":".","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"jgfcA8rPLz"}],"key":"L3ojL0sSmo"},{"type":"listItem","spread":true,"position":{"start":{"line":390,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"strong","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"key":"TeXB0SzO9L"}],"key":"Sbweoy4jpY"},{"type":"text","value":": Simulate a complete game episode starting with the action ","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"key":"PKdzEyxUz5"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"html":"anewa_{\\text{new}}anew","key":"YnZIuDLJF2"},{"type":"text","value":"\nand then playing according to ","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"key":"bSc4nhpEEs"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"paqp6pJ7t9"},{"type":"text","value":".\nThis results in the outcome ","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"key":"vahxnmWOOU"},{"type":"inlineMath","value":"r \\in \\{ +1, -1 \\}","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"html":"r{+1,1}r \\in \\{ +1, -1 \\}r{+1,1}","key":"ED0k6PevMG"},{"type":"text","value":".","position":{"start":{"line":390,"column":1},"end":{"line":390,"column":1}},"key":"z1dea1mRba"}],"key":"uEinfJPrtK"},{"type":"listItem","spread":true,"position":{"start":{"line":393,"column":1},"end":{"line":397,"column":1}},"children":[{"type":"strong","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"key":"IazWWILRiq"}],"key":"Ri2N87E5Z8"},{"type":"text","value":": For each ","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"key":"TIQxcBdDiE"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"QtU7zY0wFD"},{"type":"text","value":":","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"key":"a0VBo15dqp"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":394,"column":1},"end":{"line":397,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":394,"column":1},"end":{"line":394,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":394,"column":1},"end":{"line":394,"column":1}},"key":"S0zkkKpTBO"},{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":394,"column":1},"end":{"line":394,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"ZtTVkSPBoL"}],"key":"GkWCNYOgw6"},{"type":"listItem","spread":true,"position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + 
r","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"Ws,aWs,a+rW^{s, a} \\gets W^{s, a} + rWs,aWs,a+r","key":"n5LQ1uCUpH"}],"key":"mBneMrGAwg"},{"type":"listItem","spread":true,"position":{"start":{"line":396,"column":1},"end":{"line":397,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":396,"column":1},"end":{"line":396,"column":1}},"key":"eFCev0uy4z"},{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":396,"column":1},"end":{"line":396,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"tCabIS4Eaf"}],"key":"IZ9xUWdp0r"}],"key":"PnJWVQlogV"}],"key":"EfK9NF2Nt9"}],"key":"qlskovX4PR"},{"type":"paragraph","position":{"start":{"line":398,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"After ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"SN9756YzW2"},{"type":"inlineMath","value":"T","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"TTT","key":"kzhmrdJOuL"},{"type":"text","value":" repeats of the above,\nwe return the action with the highest UCB value ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"me2EvkIMaS"},{"type":"crossReference","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"children":[{"type":"text","value":"(","key":"BG2HJTTfxu"},{"type":"text","value":"8.4","key":"gvXD5hKwEv"},{"type":"text","value":")","key":"f44kGHREbI"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"CbuKzJFhxx"},{"type":"text","value":".\nThen play continues.","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"tjJeEYyEsP"}],"key":"vVfkhmENMN"},{"type":"paragraph","position":{"start":{"line":402,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Between turns, we can keep the subtree whose statistics we have visited so far.\nHowever, the rest of the tree for the actions we did ","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"tfIx2B57gj"},{"type":"emphasis","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"qvDefdYf1b"}],"key":"FLd2KDjg39"},{"type":"text","value":" end up taking gets discarded.","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"AYA3iiYOmk"}],"key":"eN9WwxgXdp"}],"enumerator":"8.2","html_id":"mcts-algorithm","key":"et5D0ZzClh"},{"type":"paragraph","position":{"start":{"line":406,"column":1},"end":{"line":407,"column":1}},"children":[{"type":"text","value":"The application which brought the MCTS algorithm to fame was DeepMind’s ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"SZMMwIGSKe"},{"type":"strong","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"children":[{"type":"text","value":"AlphaGo","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"H4OxjeKkeZ"}],"key":"e663AA87W9"},{"type":"text","value":" ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"tSoRUADsBn"},{"type":"cite","kind":"narrative","label":"silver_mastering_2016","identifier":"silver_mastering_2016","children":[{"type":"text","value":"Silver 
","key":"QHRIuHj1FU"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"bY4vaY2t4Q"}],"key":"PmrCHuUon8"},{"type":"text","value":" (2016)","key":"jkLE0PT1rf"}],"enumerator":"1","key":"uRz4Euk0ly"},{"type":"text","value":".\nSince then, it has been used in numerous applications ranging from games to automated theorem proving.","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"CCgidgWUmv"}],"key":"O9XmX58Erw"},{"type":"paragraph","position":{"start":{"line":409,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"text","value":"How accurate is this Monte Carlo estimation?\nIt might depend heavily on the rollout policy ","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"key":"hxbgkzhdTb"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"VMu07c61bM"},{"type":"text","value":".\nIf the distribution it induces over games is very different from the distribution seen during real gameplay,\nwe might end up with a poor approximation to the actual value of a state.","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"key":"OwjSuYKN1l"}],"key":"pDbyUPhg5t"},{"type":"heading","depth":3,"position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"Value approximation","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"key":"veGMRKyJZV"}],"identifier":"value-approximation","label":"Value approximation","html_id":"value-approximation","implicit":true,"enumerator":"8.5.1","key":"YFp0zUqNZ8"},{"type":"paragraph","position":{"start":{"line":416,"column":1},"end":{"line":418,"column":1}},"children":[{"type":"text","value":"To remedy this,\nwe might make use of a value function ","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"qRdiBmGtSB"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"g4O5a9cu93"},{"type":"text","value":" that more efficiently approximates the value of a state.\nThen, we can replace the simulation step of ","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"WA2oUvZSUo"},{"type":"crossReference","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"children":[{"type":"text","value":"MCTS","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"AmSjROv30k"}],"identifier":"mcts-algorithm","label":"mcts-algorithm","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.2","resolved":true,"html_id":"mcts-algorithm","key":"aoIXd6jOqt"},{"type":"text","value":" with evaluating ","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"KtAHgg0L3r"},{"type":"inlineMath","value":"r = v(P(s_\\text{new}, a_\\text{new}))","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"html":"r=v(P(snew,anew))r = v(P(s_\\text{new}, a_\\text{new}))r=v(P(snew,anew))","key":"x4jWN7HocD"},{"type":"text","value":".","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"bGinIlQnY8"}],"key":"zhWdoRg5B8"},{"type":"paragraph","position":{"start":{"line":420,"column":1},"end":{"line":421,"column":1}},"children":[{"type":"text","value":"We might also make use of a 
","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"key":"jBYnmJwUEz"},{"type":"emphasis","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"children":[{"type":"text","value":"policy","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"key":"mUWlAKP9WY"}],"key":"Fi06TONDQ3"},{"type":"text","value":" function ","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"key":"LYXUFc2UaZ"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"html":"π:S(A)\\pi : \\mathcal{S} \\to \\triangle(\\mathcal{A})π:S(A)","key":"U1gXye15LK"},{"type":"text","value":" that provides “intuition” as to which actions are more valuable in a given state.\nWe can scale the “exploration” term of ","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"key":"o34SLn8AUf"},{"type":"crossReference","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"children":[{"type":"text","value":"(","key":"TmuGpWyIAm"},{"type":"text","value":"8.4","key":"Sxh6iW82KO"},{"type":"text","value":")","key":"q5BRCXn9Wu"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"rZP36kNULz"},{"type":"text","value":" according to the policy function’s outputs.","position":{"start":{"line":420,"column":1},"end":{"line":420,"column":1}},"key":"NiXfbNIg4A"}],"key":"jjBTT1M3iI"},{"type":"paragraph","position":{"start":{"line":423,"column":1},"end":{"line":424,"column":1}},"children":[{"type":"text","value":"Putting these together,\nwe can describe an updated version of MCTS that makes use of these value and policy functions:","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"nor4N7V6z4"}],"key":"tUNVYzPcHx"},{"type":"proof","kind":"algorithm","label":"mcts-policy-value","identifier":"mcts-policy-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search with policy and value functions","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"key":"BeeOWc24XO"}],"key":"I6o6Tk61fz"},{"type":"paragraph","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":429,"column":1},"end":{"line":429,"column":1}},"key":"ohj5R5oz6G"}],"key":"ED4wAmFACh"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":430,"column":1},"end":{"line":434,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"html":"TTT","key":"GZ37svT3rB"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"key":"iQXjha8MAO"}],"key":"vxdQMBCCHu"},{"type":"listItem","spread":true,"position":{"start":{"line":431,"column":1},"end":{"line":431,"column":1}},"children":[{"type":"inlineMath","value":"v","position":{"start":{"line":431,"column":1},"end":{"line":431,"column":1}},"html":"vvv","key":"IdUyMXEKLs"},{"type":"text","value":", a value function that evaluates how good a state 
is","position":{"start":{"line":431,"column":1},"end":{"line":431,"column":1}},"key":"I7BYorvf3h"}],"key":"xCgzf0QVSR"},{"type":"listItem","spread":true,"position":{"start":{"line":432,"column":1},"end":{"line":432,"column":1}},"children":[{"type":"text","value":"π","position":{"start":{"line":432,"column":1},"end":{"line":432,"column":1}},"key":"fcNOmunMux"},{"type":"text","value":", a policy function that encourages certain actions","position":{"start":{"line":432,"column":1},"end":{"line":432,"column":1}},"key":"y6JEOXnwRS"}],"key":"eLdd9O69EJ"},{"type":"listItem","spread":true,"position":{"start":{"line":433,"column":1},"end":{"line":434,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"html":"ccc","key":"XhZIspy7cO"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"Lu9jaJqHJo"}],"key":"mNjHzV0mX6"}],"key":"GO2AJTdBd2"},{"type":"paragraph","position":{"start":{"line":435,"column":1},"end":{"line":435,"column":1}},"children":[{"type":"text","value":"To select a move in state ","position":{"start":{"line":435,"column":1},"end":{"line":435,"column":1}},"key":"gbsMFdKRqI"},{"type":"inlineMath","value":"s_\\text{start}","position":{"start":{"line":435,"column":1},"end":{"line":435,"column":1}},"html":"sstarts_\\text{start}sstart","key":"XziNbz8bUr"},{"type":"text","value":", we repeat the following four steps ","position":{"start":{"line":435,"column":1},"end":{"line":435,"column":1}},"key":"Q13d3LoLw2"},{"type":"inlineMath","value":"T","position":{"start":{"line":435,"column":1},"end":{"line":435,"column":1}},"html":"TTT","key":"wip9ks19h6"},{"type":"text","value":" times:","position":{"start":{"line":435,"column":1},"end":{"line":435,"column":1}},"key":"sRmPxUKMWm"}],"key":"XouY1I7kth"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":437,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":437,"column":1},"end":{"line":445,"column":1}},"children":[{"type":"strong","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"D7TGHbJPsX"}],"key":"KslVUsB3RE"},{"type":"text","value":": We start at ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"nmP5fswTQB"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"lulygIe0LR"},{"type":"text","value":". 
Let ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"Asb2HOLJVY"},{"type":"text","value":"τ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"CGHAW4xFvM"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"NRuUDW49Ym"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":438,"column":1},"end":{"line":445,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":438,"column":1},"end":{"line":445,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"oO4dFLrly0"},{"type":"inlineMath","value":"s","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"html":"sss","key":"PbiFFQJrrZ"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"AEo2hbMRdg"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":439,"column":1},"end":{"line":445,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":439,"column":1},"end":{"line":443,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"xIDB7sNVeZ"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"wuFSpSmK30"},{"type":"text","value":", where\n","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"BSKRecuxfT"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\pi(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"identifier":"ucb-tree-policy","label":"ucb-tree-policy","html_id":"ucb-tree-policy","html":"UCBs,a=Ws,aNs+cπ(as)lnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\pi(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}UCBs,a=NsWs,a+cπ(as)Ns,alnNs","enumerator":"8.5","key":"XpJTmxt2xD"}],"key":"wc2xBvQIhS"},{"type":"listItem","spread":true,"position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"children":[{"type":"text","value":"Append ","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"key":"kkOudEQRdn"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"iYuN6uxSPJ"},{"type":"text","value":" to ","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"key":"HV92vedDSr"},{"type":"text","value":"τ","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"key":"wvkXt2eP6l"}],"key":"rElR0Ul0OS"},{"type":"listItem","spread":true,"position":{"start":{"line":445,"column":1},"end":{"line":445,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":445,"column":1},"end":{"line":445,"column":1}},"key":"Q9DP4B5AAz"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":445,"column":1},"end":{"line":445,"column":1}},"html":"sP(s,a)s \\gets P(s, 
a)sP(s,a)","key":"EM7Pa0sIon"}],"key":"nFlsuRL4LY"}],"key":"QNeI9T9oyT"}],"key":"CHDB3SCDtV"}],"key":"lgZgeDdEMa"}],"key":"n91JEA1zN2"},{"type":"listItem","spread":true,"position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"children":[{"type":"strong","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"NaFbCcIO2X"}],"key":"h1QxbiXFwy"},{"type":"text","value":": Let ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"Xa6O8BWT7L"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"html":"snews_\\text{new}snew","key":"jEEDWvi4J3"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"ZB2FZWQA5B"},{"type":"text","value":"τ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"sndTOlmYsA"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"ilPvIe5VU4"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"html":"snews_\\text{new}snew","key":"s1FP00SIny"},{"type":"text","value":". Call it ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"JWVhcCZi3j"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"html":"anewa_{\\text{new}}anew","key":"JwZtm0wdci"},{"type":"text","value":". Add it to ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"Lz9qdf7eV2"},{"type":"text","value":"τ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"dZznnUZal5"},{"type":"text","value":".","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"P2e6m2L8cd"}],"key":"NcUAQ8tw1g"},{"type":"listItem","spread":true,"position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"children":[{"type":"strong","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"ZwN5UrBT4R"}],"key":"YHj1lHsWSl"},{"type":"text","value":": Evaluate ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"LbVTj6Ga58"},{"type":"inlineMath","value":"r = v(P(s_\\text{new}, a_\\text{new}))","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"r=v(P(snew,anew))r = v(P(s_\\text{new}, a_\\text{new}))r=v(P(snew,anew))","key":"jNqRGU4X8m"},{"type":"text","value":". 
This approximates the value of the game after taking the action ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"WBcTeXEiWI"},{"type":"inlineMath","value":"a_\\text{new}","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"anewa_\\text{new}anew","key":"Z6qbqgIL0h"},{"type":"text","value":".","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"TnMkltVivu"}],"key":"IsboVanFcB"},{"type":"listItem","spread":true,"position":{"start":{"line":448,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"strong","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"Z4j33DS7p9"}],"key":"ExyqYaYi1c"},{"type":"text","value":": For each ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"oiVLW3Dym3"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"XTjE5Ad8qb"},{"type":"text","value":":","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"pgbJapkjG8"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":449,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"UC0asNi8vA"},{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"BirNG7x4Ht"}],"key":"GlSUip2lvA"},{"type":"listItem","spread":true,"position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + r","position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"html":"Ws,aWs,a+rW^{s, a} \\gets W^{s, a} + rWs,aWs,a+r","key":"boyY2Xzk35"}],"key":"HiCYwRyDOJ"},{"type":"listItem","spread":true,"position":{"start":{"line":451,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"psZD7saDqa"},{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"EeuBCrCeay"}],"key":"QiXe6fSCdf"}],"key":"ukNuIMk3Xm"}],"key":"LE3FOIT5vT"}],"key":"DHWjDiGL9l"},{"type":"paragraph","position":{"start":{"line":453,"column":1},"end":{"line":454,"column":1}},"children":[{"type":"text","value":"We finally return the action with the highest UCB value ","position":{"start":{"line":453,"column":1},"end":{"line":453,"column":1}},"key":"t4tFkxg0fi"},{"type":"crossReference","position":{"start":{"line":453,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"text","value":"(","key":"aiMFpuZntI"},{"type":"text","value":"8.5","key":"EgKe6Ov2NZ"},{"type":"text","value":")","key":"wxmEwpxkdl"}],"identifier":"ucb-tree-policy","label":"ucb-tree-policy","kind":"equation","template":"(%s)","enumerator":"8.5","resolved":true,"html_id":"ucb-tree-policy","key":"jVNv0qvv0V"},{"type":"text","value":".\nThen play continues. 
As before, we can reuse the tree across timesteps.","position":{"start":{"line":453,"column":1},"end":{"line":453,"column":1}},"key":"HgudoXPbyP"}],"key":"PLhVgVIern"}],"enumerator":"8.3","html_id":"mcts-policy-value","key":"lRhfdZKX1Q"},{"type":"paragraph","position":{"start":{"line":457,"column":1},"end":{"line":463,"column":1}},"children":[{"type":"text","value":"How do we actually compute a useful ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"c0pvFhLjlx"},{"type":"text","value":"π","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"XlAc2Qaya9"},{"type":"text","value":" and ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"DixQYXluD3"},{"type":"inlineMath","value":"v","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"vvv","key":"vOP78kre22"},{"type":"text","value":"?\nIf we have some existing dataset of trajectories,\nwe could use ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"SIPm0eHqCI"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"rUbTSF1oIr"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"zUHlOnGnug"},{"type":"text","value":" (that is, imitation learning)\nto generate a policy ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"NGTjnkXw8w"},{"type":"text","value":"π","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"yfriDQM8PC"},{"type":"text","value":" via behavioral cloning\nand learn ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"NoSRsKczvx"},{"type":"inlineMath","value":"v","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"vvv","key":"ol4N0WZ1ws"},{"type":"text","value":" by regressing the game outcomes onto states.\nThen, plugging these into ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"Z97FVxgfZZ"},{"type":"crossReference","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"the above algorithm","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"OBMk3OYbQJ"}],"identifier":"mcts-policy-value","label":"mcts-policy-value","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.3","resolved":true,"html_id":"mcts-policy-value","key":"xfvmE26XOr"},{"type":"text","value":"\nresults in a stronger policy by using tree search to “think ahead”.","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"i9cTyQPn24"}],"key":"Hxei2aRGti"},{"type":"paragraph","position":{"start":{"line":465,"column":1},"end":{"line":466,"column":1}},"children":[{"type":"text","value":"But we don’t have to stop at just one improvement step;\nwe could iterate this process via 
","position":{"start":{"line":465,"column":1},"end":{"line":465,"column":1}},"key":"vATsxgcWdw"},{"type":"strong","position":{"start":{"line":465,"column":1},"end":{"line":465,"column":1}},"children":[{"type":"text","value":"self-play","position":{"start":{"line":465,"column":1},"end":{"line":465,"column":1}},"key":"WL9ozfzKg1"}],"key":"R1RupQ2tI9"},{"type":"text","value":".","position":{"start":{"line":465,"column":1},"end":{"line":465,"column":1}},"key":"ZxVBZlqviR"}],"key":"gwNkc91Ptj"},{"type":"heading","depth":3,"position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"text","value":"Self-play","position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"key":"sT371ywPqF"}],"identifier":"self-play","label":"Self-play","html_id":"self-play","implicit":true,"enumerator":"8.5.2","key":"kTLumaL0W7"},{"type":"paragraph","position":{"start":{"line":470,"column":1},"end":{"line":480,"column":1}},"children":[{"type":"text","value":"Recall the ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"epTSPCFHEn"},{"type":"link","url":"#policy-iteration","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"sy8ybIpLlr"}],"urlSource":"#policy-iteration","key":"Lzj7H5IdQl"},{"type":"text","value":" algorithm from the ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"qnuyKRSU7f"},{"type":"link","url":"/mdps","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"MDPs","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"oYIjBSekCd"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"mgMlBAu70a"},{"type":"text","value":" chapter.\nPolicy iteration alternates between ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"FTNdVNNnKh"},{"type":"strong","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"myCTc626bL"}],"key":"yYl4v1kCxM"},{"type":"text","value":" (taking ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"B9aNEsJjew"},{"type":"inlineMath","value":"\\pI","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"error":true,"message":"Undefined control sequence: \\pI at position 1: \\̲p̲I̲","key":"IgASHfm45o"},{"type":"text","value":" and computing ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"yvLrNVLPtc"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"VπV^\\piVπ","key":"yAY1vjys9M"},{"type":"text","value":")\nand ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"YDRCGJ0P0K"},{"type":"strong","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"policy improvement","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"KRE4CBP0BV"}],"key":"nm9JJRP58N"},{"type":"text","value":" (setting 
","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"cJ0Vg26ieF"},{"type":"text","value":"π","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"SC838ScAKl"},{"type":"text","value":" to be greedy with respect to ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"UsaMoikJOS"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"VπV^\\piVπ","key":"pvdURy8cBn"},{"type":"text","value":").\nAbove, we saw how MCTS can be thought of as a “policy improvement” operation:\nfor a given policy ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"znfrrkbqRM"},{"type":"inlineMath","value":"\\pi^0","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"π0\\pi^0π0","key":"tUbdHtz0QA"},{"type":"text","value":",\nwe can use it to influence MCTS.\nThe resulting algorithm is itself a policy ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"R35BziDff6"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"S47mQd35X2"},{"type":"text","value":" that maps from states to actions.\nNow, we can use ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"X81lLr2Wnq"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"tgfGOCXzjT"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"bjlY87ahio"},{"type":"text","value":"\nto obtain a new policy ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"PWYmkPiyh3"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"π1\\pi^1π1","key":"ioeKmFhuHA"},{"type":"text","value":" that imitates ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"SmKRQNMpd8"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"SqwVWH9wHg"},{"type":"text","value":".\nWe can now use ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"wciwz7DYke"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"π1\\pi^1π1","key":"GiqWLwHn9y"},{"type":"text","value":" to influence MCTS,\nand repeat.","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"Bl1a5vwHQx"}],"key":"XLRvRYjwTZ"},{"type":"proof","kind":"algorithm","label":"mcts-self-play","identifier":"mcts-self-play","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"MCTS with 
self-play","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"AQJcE3TgsJ"}],"key":"eK94BScm25"},{"type":"paragraph","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"w4tygtxHlq"}],"key":"R4rF7VUarN"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":487,"column":1},"end":{"line":491,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":487,"column":1},"end":{"line":487,"column":1}},"children":[{"type":"text","value":"A parameterized policy ","position":{"start":{"line":487,"column":1},"end":{"line":487,"column":1}},"key":"MW4q3Ig7L5"},{"type":"inlineMath","value":"\\pi : \\Theta \\to \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":487,"column":1},"end":{"line":487,"column":1}},"html":"π:ΘS(A)\\pi : \\Theta \\to \\mathcal{S} \\to \\triangle(\\mathcal{A})π:ΘS(A)","key":"o54DmkeWdK"}],"key":"S23EuYlEuW"},{"type":"listItem","spread":true,"position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"A parameterized value function ","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"eGIFiuwf91"},{"type":"inlineMath","value":"v : \\Theta \\to \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"html":"v:ΘSRv : \\Theta \\to \\mathcal{S} \\to \\mathbb{R}v:ΘSR","key":"FyDmoxwlWi"}],"key":"sx9aJL1Sbp"},{"type":"listItem","spread":true,"position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"children":[{"type":"text","value":"A number of trajectories ","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"qlsNv2gfak"},{"type":"inlineMath","value":"M","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"html":"MMM","key":"yt5ywOd6m4"},{"type":"text","value":" to generate","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"Bv7n0Ji79W"}],"key":"cqRykRi7gk"},{"type":"listItem","spread":true,"position":{"start":{"line":490,"column":1},"end":{"line":491,"column":1}},"children":[{"type":"text","value":"The initial parameters ","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"key":"hXRBb4CU1h"},{"type":"inlineMath","value":"\\theta^0","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"html":"θ0\\theta^0θ0","key":"QF3nlIBEJp"}],"key":"GBh0ggpS97"}],"key":"EWUg4mwfEK"},{"type":"paragraph","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"children":[{"type":"text","value":"Initialize ","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"key":"O6H5TcIZLo"},{"type":"inlineMath","value":"\\theta \\gets \\theta^0","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"html":"θθ0\\theta \\gets \\theta^0θθ0","key":"PgcwVvkXxZ"},{"type":"text","value":".","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"key":"emqAlLqmLa"}],"key":"kdcj7lIP2p"},{"type":"paragraph","position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"key":"qH2wnS0uWp"},{"type":"inlineMath","value":"t = 0, \\dots, 
T-1","position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"html":"t=0,,T1t = 0, \\dots, T-1t=0,,T1","key":"Qdy6zU7W68"},{"type":"text","value":":","position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"key":"JfClEPg6cn"}],"key":"yQHjyYubgt"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":496,"column":1},"end":{"line":498,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"children":[{"type":"strong","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"children":[{"type":"text","value":"Policy improvement","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"hEGpDqqYKg"}],"key":"MKcoTkB0eW"},{"type":"text","value":": Use ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"SobD6AC8OS"},{"type":"inlineMath","value":"\\pi_{\\theta}","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"πθ\\pi_{\\theta}πθ","key":"TWal2MMBzb"},{"type":"text","value":" with MCTS to play against itself ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"FhZI67K7lL"},{"type":"inlineMath","value":"M","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"MMM","key":"RsDuLD0L7Y"},{"type":"text","value":" times. This generates ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"FsSKUMR261"},{"type":"inlineMath","value":"M","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"MMM","key":"xDjptbtdqN"},{"type":"text","value":" trajectories ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"DrtMOBtlj7"},{"type":"inlineMath","value":"\\tau_0, \\dots, \\tau_{M-1}","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"τ0,,τM1\\tau_0, \\dots, \\tau_{M-1}τ0,,τM1","key":"znTrknkYup"},{"type":"text","value":".","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"af6YfWhb9S"}],"key":"utSFTaCRGt"},{"type":"listItem","spread":true,"position":{"start":{"line":497,"column":1},"end":{"line":498,"column":1}},"children":[{"type":"strong","position":{"start":{"line":497,"column":1},"end":{"line":497,"column":1}},"children":[{"type":"text","value":"Policy evaluation","position":{"start":{"line":497,"column":1},"end":{"line":497,"column":1}},"key":"uAV8VPvk1c"}],"key":"QEzq55oW78"},{"type":"text","value":": Use behavioral cloning to mimic the behavior of the policy induced by MCTS. 
That is,\n","position":{"start":{"line":497,"column":1},"end":{"line":497,"column":1}},"key":"zAbxvCxbDS"},{"type":"math","value":"\\theta \\gets \\argmin_\\theta - \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\log \\pi_\\theta(a_\\hi \\mid s_\\hi)","position":{"start":{"line":497,"column":1},"end":{"line":497,"column":1}},"html":"θarg minθm=0M1h=0H1logπθ(ahsh)\\theta \\gets \\argmin_\\theta - \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\log \\pi_\\theta(a_\\hi \\mid s_\\hi)θθargminm=0M1h=0H1logπθ(ahsh)","enumerator":"8.6","key":"AZYHwDD9Fd"}],"key":"DbLDTu4njI"}],"key":"DAv7zR1PKI"}],"enumerator":"8.4","html_id":"mcts-self-play","key":"eIkzmO4y0g"},{"type":"heading","depth":2,"position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"children":[{"type":"text","value":"References","position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"key":"gFHHIltcPK"}],"identifier":"references","label":"References","html_id":"references","implicit":true,"enumerator":"8.6","key":"rDK7eHgzrX"},{"type":"paragraph","position":{"start":{"line":505,"column":1},"end":{"line":505,"column":1}},"children":[{"type":"text","value":"Chapter 5 of ","position":{"start":{"line":505,"column":1},"end":{"line":505,"column":1}},"key":"UxZOYKufPE"},{"type":"cite","kind":"narrative","label":"russell_artificial_2021","identifier":"russell_artificial_2021","children":[{"type":"text","value":"Russell & Norvig (2021)","key":"L37ltyWPjY"}],"enumerator":"2","key":"idFEeB1AH0"},{"type":"text","value":" provides an excellent overview of search methods in games.","position":{"start":{"line":505,"column":1},"end":{"line":505,"column":1}},"key":"wT8x9LngJE"}],"key":"UArzM63isp"}],"key":"HX9L60VxLo"}],"key":"UcHJ9LjdC9"},"references":{"cite":{"order":["silver_mastering_2016","russell_artificial_2021"],"data":{"silver_mastering_2016":{"label":"silver_mastering_2016","enumerator":"1","doi":"10.1038/nature16961","html":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., van den Driessche, G., Schrittwieser, J., Antonoglou, I., Panneershelvam, V., Lanctot, M., Dieleman, S., Grewe, D., Nham, J., Kalchbrenner, N., Sutskever, I., Lillicrap, T., Leach, M., Kavukcuoglu, K., Graepel, T., & Hassabis, D. (2016). Mastering the Game of Go with Deep Neural Networks and Tree Search. Nature, 529(7587), 484–489. 10.1038/nature16961","url":"https://doi.org/10.1038/nature16961"},"russell_artificial_2021":{"label":"russell_artificial_2021","enumerator":"2","html":"Russell, S. J., & Norvig, P. (2021). Artificial Intelligence: A Modern Approach (Fourth edition). 
Pearson."}}}},"footer":{"navigation":{"prev":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"9 Exploration in MDPs","url":"/exploration","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"5ad6f72255f948ee283927b483938dbb9b2b372614850f669c0034ff5fc30bdc","slug":"planning","location":"/planning.md","dependencies":[],"frontmatter":{"title":"8 Tree Search Methods","numbering":{"all":{"enabled":true},"enumerator":{"template":"8.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","thumbnailOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp","exports":[{"format":"md","filename":"planning.md","url":"/build/planning-7b5ef62df9036b73ec5f6119008db1f7.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"DrkHKuAHY8"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"8.1","key":"apsgGDM72h"},{"type":"paragraph","position":{"start":{"line":22,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"Have you ever lost a strategy game against a skilled opponent?\nIt probably seemed like they were ahead of you at every turn.\nThey might have been ","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"Ay8wqXguob"},{"type":"emphasis","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"planning ahead","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"PjdA72JHwb"}],"key":"PZKIceU8eM"},{"type":"text","value":" and anticipating your actions,\nthen planning around them in order to win.\nIf this opponent was a computer,\nthey might have been using one of the strategies that we are about to explore.","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"bITX0yZkRO"}],"key":"dVIa4jmYBt"},{"type":"heading","depth":2,"position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Deterministic, zero sum, fully observable two-player games","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"MJaTKvUkeM"}],"identifier":"deterministic-zero-sum-fully-observable-two-player-games","label":"Deterministic, zero sum, fully observable two-player 
games","html_id":"deterministic-zero-sum-fully-observable-two-player-games","implicit":true,"enumerator":"8.2","key":"PvSFKBkGLh"},{"type":"paragraph","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"In this chapter, we will focus on games that are:","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"O25kI2JDLV"}],"key":"mZ0ZAU8d1w"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":33,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"deterministic,","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"kyOxBmRMZp"}],"key":"zJ9Xj8YvWK"}],"key":"lPm3G8u1Co"},{"type":"listItem","spread":true,"position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"zero sum","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"LyKwJszgQo"}],"key":"alb4xC0n0L"},{"type":"text","value":" (one player wins and the other loses),","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"fLCSX8t4y3"}],"key":"oH6Y3aZEOB"},{"type":"listItem","spread":true,"position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"fully observable,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"vaxIz1POPY"}],"key":"HQYQr6k8iR"},{"type":"text","value":" that is, the state of the game is perfectly known by both players,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"if6NU6NNps"}],"key":"KLF00wLBkD"},{"type":"listItem","spread":true,"position":{"start":{"line":36,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"for ","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"J8n4yVTFDE"},{"type":"emphasis","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"two players","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"qmP4Q3YfoY"}],"key":"gFWKgREkEH"},{"type":"text","value":" that alternate turns,","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"WX1CAk5Pc5"}],"key":"ugckeA2Wzn"}],"key":"FRLrPFSaqM"},{"type":"paragraph","position":{"start":{"line":38,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"We can represent such a game as a ","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"LKJ4ofAoGh"},{"type":"emphasis","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"text","value":"complete game tree.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"PEztQCnVaF"}],"key":"Nmgc7VOzAx"},{"type":"text","value":"\nEach possible state is a node in the tree,\nand since we only consider deterministic games,\nwe can represent actions as edges leading from the current state to the next.\nEach path through the tree, from root to leaf, represents a single 
game.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"TGxE1v7IHd"}],"key":"S99pzBe2HH"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","alt":"The first two layers of the complete game tree of tic-tac-toe.\nFrom Wikimedia.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"EoJ42K2qhM","urlSource":"shared/tic_tac_toe.png","urlOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"The first two layers of the complete game tree of tic-tac-toe.\nFrom Wikimedia.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"tDkSkr6iMk"}],"key":"f3enJzbyAh"}],"key":"frIAstDnxG"}],"enumerator":"8.1","key":"GpptOhTzQw"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"If you could store the complete game tree on a computer,\nyou would be able to win every potentially winnable game\nby searching all paths from your current state and taking a winning move.\nWe will see an explicit algorithm for this in ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"drYDTWzPFJ"},{"type":"crossReference","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"the next section","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"DXqVVq9K5D"}],"identifier":"min-max-search","label":"min-max-search","kind":"heading","template":"Section %s","enumerator":"8.3","resolved":true,"html_id":"min-max-search","key":"OOGviuGNB4"},{"type":"text","value":".\nHowever, as games become more complex,\nit becomes computationally impossible to search every possible path.","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"WtRb2MqvFZ"}],"key":"fugg1lbAh2"},{"type":"paragraph","position":{"start":{"line":58,"column":1},"end":{"line":66,"column":1}},"children":[{"type":"text","value":"For instance,\na chess player has roughly 30 actions to choose from at each turn,\nand each game takes roughly 40 moves per player,\nso trying to solve chess exactly using minimax\nwould take somewhere on the order of ","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"DzhRYZ3jr9"},{"type":"inlineMath","value":"30^{80} \\approx 10^{118}","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"html":"30801011830^{80} \\approx 10^{118}308010118","key":"QKNswIhj26"},{"type":"text","value":" operations.\nThat’s 10 billion billion billion billion billion billion billion billion billion billion billion billion billion operations.\nAs of the time of writing,\nthe fastest processor can achieve almost 10 GHz (10 billion operations per second),\nso to fully solve chess using minimax is many, many orders of magnitude out of reach.","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"U7nn48FZvn"}],"key":"ruebTJR6uV"},{"type":"paragraph","position":{"start":{"line":68,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"It is thus intractable, in any realistic setting, to solve the complete game tree exactly.\nLuckily, only a small fraction of those games ever occur in reality;\nLater in this chapter,\nwe will explore ways to 
","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"hilWIQygtl"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"prune away","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"EoJavf2Zld"}],"key":"SQrArEGrVR"},{"type":"text","value":" parts of the tree that we know we can safely ignore.\nWe can also ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"TwcMU0zU7I"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"irobc49ft9"}],"key":"pChlnGtgUr"},{"type":"text","value":" the value of a state without fully evaluating it.\nUsing these approximations, we can no longer ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"LDdjtUgDYn"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"guarantee","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"kC94G5LzvZ"}],"key":"jOv4h5uc9A"},{"type":"text","value":" winning the game,\nbut we can come up with strategies that will do well against most opponents.","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"CQYq4DoWNp"}],"key":"axs5wzDAZz"},{"type":"heading","depth":3,"position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"Notation","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"weUAxLITcs"}],"identifier":"notation","label":"Notation","html_id":"notation","implicit":true,"enumerator":"8.2.1","key":"lp6rZ36wxe"},{"type":"paragraph","position":{"start":{"line":78,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Let us now describe these games formally.\nWe’ll call the first player Max and the second player Min.\nMax seeks to maximize the final game score,\nwhile Min seeks to minimize the final game score.","position":{"start":{"line":78,"column":1},"end":{"line":78,"column":1}},"key":"YlQTIKptii"}],"key":"oJ8UuJWAfc"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":83,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"We’ll use ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"AI07z0P4Jn"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"S\\mathcal{S}S","key":"blzGJgwBQ8"},{"type":"text","value":" to denote the set of all possible game states.","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"o70EmLH50v"}],"key":"cWzyYogPOM"},{"type":"listItem","spread":true,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"The game begins in some ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"NIjvHwReSj"},{"type":"strong","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"initial 
state","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"y2JK36JzDL"}],"key":"bLUqCyJ2oM"},{"type":"text","value":" ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"R7pxUDjoyx"},{"type":"inlineMath","value":"s_0 \\in \\mathcal{S}","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"html":"s0Ss_0 \\in \\mathcal{S}s0S","key":"NsTnx4azAn"},{"type":"text","value":".","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"bI5Bea2nZB"}],"key":"XmJgrng4wg"},{"type":"listItem","spread":true,"position":{"start":{"line":85,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"Max moves on even turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"NY4BlPwuOz"},{"type":"inlineMath","value":"h = 2n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2nh = 2nh=2n","key":"itICrhln0s"},{"type":"text","value":",\nand Min moves on odd turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"YPAgXmOi1D"},{"type":"inlineMath","value":"h = 2n+1","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2n+1h = 2n+1h=2n+1","key":"LjqmZs3kqn"},{"type":"text","value":",\nwhere ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"lmB7OfiAPc"},{"type":"inlineMath","value":"n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"nnn","key":"ACacTvPaVj"},{"type":"text","value":" is a natural number.","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"A1fLaHMrPn"}],"key":"j1044p4aTV"},{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"The space of possible actions, ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"vXCWmOqTWT"},{"type":"inlineMath","value":"\\mathcal{A}_h(s)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"Ah(s)\\mathcal{A}_h(s)Ah(s)","key":"PAEYH8cPgp"},{"type":"text","value":",\ndepends on the state itself, as well as whose turn it is.\n(For example, in tic-tac-toe, Max can only play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"aneXq0R5Yc"},{"type":"inlineCode","value":"X","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"OUG7Re7dN9"},{"type":"text","value":"s while Min can only play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"ZoHuSAyXZv"},{"type":"inlineCode","value":"O","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"m9lnpJa3fa"},{"type":"text","value":"s.)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"HVGvHq3e87"}],"key":"CHSodUY5b6"},{"type":"listItem","spread":true,"position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"The game ends after ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"Et23gUckuq"},{"type":"inlineMath","value":"H","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"html":"HHH","key":"G7YUeJkXn1"},{"type":"text","value":" total moves (which might be even or odd). 
We call the final state a ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"zYoRbHos0v"},{"type":"strong","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"terminal state","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"dEvFGr7nyv"}],"key":"goKYDaGrf1"},{"type":"text","value":".","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"FWwtyAy3Oj"}],"key":"jA3lSg9avS"},{"type":"listItem","spread":true,"position":{"start":{"line":92,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"inlineMath","value":"P","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"PPP","key":"TN3Cfm5LKh"},{"type":"text","value":" denotes the ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"j0WyMVvqXz"},{"type":"strong","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"wsV2PkqtVu"}],"key":"g0QSrJQizh"},{"type":"text","value":", that is,\n","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"vpHT9MF6Lc"},{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"HsdI1vuIwg"},{"type":"text","value":" denotes the resulting state when taking action ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"c0eC2yShtE"},{"type":"inlineMath","value":"a \\in \\mathcal{A}(s)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"aA(s)a \\in \\mathcal{A}(s)aA(s)","key":"xr5RP7rBJM"},{"type":"text","value":" in state ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"LERX6fvQFw"},{"type":"inlineMath","value":"s","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"sss","key":"T6OrRNZLfM"},{"type":"text","value":".","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"MtBHcJsT6F"}],"key":"AdCdfeSx4P"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"r(s)r(s)r(s)","key":"PoKGO3jHql"},{"type":"text","value":" denotes the ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"K7ajyVlXSY"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"game score","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Dwpc3VkwvH"}],"key":"LButVLC7Oe"},{"type":"text","value":" of the terminal state ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"UwPNC3IwLy"},{"type":"inlineMath","value":"s","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"sss","key":"rE5wVUDafO"},{"type":"text","value":".\nNote that this is some positive or negative value seen by both players:\nA positive value indicates Max winning, a negative value indicates Min winning, and a value of 
","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Gn3xjyBwyN"},{"type":"text","value":"0","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"CCDaM9JBoa"},{"type":"text","value":" indicates a tie.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"EtQSewOEXG"}],"key":"XyaBRCmFB9"}],"key":"qpxXZdr8hD"},{"type":"paragraph","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"We also call the sequence of states and actions a ","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"BY9efIvE3r"},{"type":"strong","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"Y0aEtVF7ZG"}],"key":"C5wwpLKfXz"},{"type":"text","value":".","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"h50SRkrSLp"}],"key":"I5y8MpOydg"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"fVQUBYTLqk"}],"key":"UJLA2OdFND"},{"type":"paragraph","position":{"start":{"line":101,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"Above, we suppose that the game ends after ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"zz0tjf1gLT"},{"type":"inlineMath","value":"H","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"HHH","key":"jdRsyjhQ3g"},{"type":"text","value":" total moves.\nBut most real games have a ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"gviYROzbgT"},{"type":"emphasis","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"variable","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"D4av2vma4H"}],"key":"dShqpD7Gin"},{"type":"text","value":" length.\nHow would you describe this?","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"wXgbyzrFUS"}],"key":"Q3X5nnFfaR"}],"key":"LRSCimHIA1"},{"type":"paragraph","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"Let us frame tic-tac-toe in this setting.","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"rD7VRdRqTp"}],"key":"NbnROuuLrf"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":108,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":108,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"Each of the ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"F293XGI7sH"},{"type":"text","value":"9","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"NH4IQi2Yvq"},{"type":"text","value":" squares is either empty, marked X, or marked O.\nSo there are ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"VOgLKYOYEW"},{"type":"inlineMath","value":"|\\mathcal{S}| = 3^9","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"html":"S=39|\\mathcal{S}| = 3^9S=39","key":"M9tSOGbAKz"},{"type":"text","value":" potential states.\nNot all of these may be 
reachable!","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"qXv4vW8i3L"}],"key":"djtCr97X6V"},{"type":"listItem","spread":true,"position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"children":[{"type":"text","value":"The initial state ","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"yoCqcY14uD"},{"type":"inlineMath","value":"s_0","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"html":"s0s_0s0","key":"tqMEeU3bio"},{"type":"text","value":" is the empty board.","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"Fahf7pECTB"}],"key":"I0cPTOoyUH"},{"type":"listItem","spread":true,"position":{"start":{"line":112,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"The set of possible actions for Max in state ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"hLRQRQ8Ccv"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"pzfBaA8lcw"},{"type":"text","value":", ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"LYchOdmdVP"},{"type":"inlineMath","value":"\\mathcal{A}_{2n}(s)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"A2n(s)\\mathcal{A}_{2n}(s)A2n(s)","key":"pBj8pJ7Fxi"},{"type":"text","value":", is the set of tuples ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"l5tu3MDvTr"},{"type":"inlineMath","value":"(\\text{``X''}, i)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"(“X”,i)(\\text{``X''}, i)(“X”,i)","key":"jfvo3PvgQS"},{"type":"text","value":" where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"KkK72rBL4D"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"VHcv5jwj8G"},{"type":"text","value":" refers to an empty square in ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"yrYczAY4E8"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"YtFuD1m3uJ"},{"type":"text","value":".\nSimilarly, ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"iNJ2v3JT8d"},{"type":"inlineMath","value":"\\mathcal{A}_{2n+1}(s)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"A2n+1(s)\\mathcal{A}_{2n+1}(s)A2n+1(s)","key":"r6qULkIlrj"},{"type":"text","value":" is the set of tuples ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"GKu1YZ8CUP"},{"type":"inlineMath","value":"(\\text{``O''}, i)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"(“O”,i)(\\text{``O''}, i)(“O”,i)","key":"pZuNZ529gj"},{"type":"text","value":" where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"kO8RqH4Wf3"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"JjxkvBNtIt"},{"type":"text","value":" refers to an empty square in 
","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"y8bVxYiGe9"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"XFEWYbQ5fC"},{"type":"text","value":".","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"g3fi7TEnTq"}],"key":"rLsSAdyoLO"},{"type":"listItem","spread":true,"position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"We can take ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"InyjeEVoQQ"},{"type":"inlineMath","value":"H = 9","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"H=9H = 9H=9","key":"FTgswr3Q6X"},{"type":"text","value":" as the longest possible game length.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"zUUFuobJdF"}],"key":"Y32ccvnUMr"},{"type":"listItem","spread":true,"position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"gujw1MdpIi"},{"type":"text","value":" for a ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"nuSlmZMRu4"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"DCUWWkrVHN"}],"key":"VgOfsw98kJ"},{"type":"text","value":" state ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"ZZ9X7EtdtK"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"RDo8R6LvpN"},{"type":"text","value":" is simply the board with the symbol and square specified by ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"l1LHNh7WEs"},{"type":"inlineMath","value":"a","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"aaa","key":"KMVPVwfa5C"},{"type":"text","value":" marked into ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"LYUNxQmldS"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"hl8FRGG8Y4"},{"type":"text","value":". Otherwise, if ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"Ht8efQbxx4"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"UftON1JN3a"},{"type":"text","value":" is a ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"ZnejDmOuea"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"aoRiFTPbmR"}],"key":"oWwMy4yRN8"},{"type":"text","value":" state, i.e. 
it already has three symbols in a row or all nine squares are filled, the state no longer changes.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"VH7rcmlw2G"}],"key":"bPTQdNjwhw"},{"type":"listItem","spread":true,"position":{"start":{"line":116,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"html":"r(s)r(s)r(s)","key":"HhCoZVMAaM"},{"type":"text","value":" at a ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"qs1FrUGr75"},{"type":"emphasis","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"P8XsqpXfA5"}],"key":"VefEYDZSqF"},{"type":"text","value":" state is ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"kKP6a6JZm7"},{"type":"text","value":"+1","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"v74rzp3ASW"},{"type":"text","value":" if there are three Xs in a row, ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"xPo4auod06"},{"type":"text","value":"-1","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"LspZoFj5pA"},{"type":"text","value":" if there are three Os in a row, and ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"RjC7x9mWyu"},{"type":"text","value":"0","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"y0rIBYEw6x"},{"type":"text","value":" otherwise.","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"Gwoaa9FXNd"}],"key":"SD1Fk7IWav"}],"key":"RMbjwkFVFQ"},
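{"type":"paragraph","children":[{"type":"text","value":"To make this formulation concrete, here is one possible sketch of the tic-tac-toe specification in code. The representation (a board as a tuple of nine squares, actions as (symbol, square) pairs) is just one choice of encoding, not part of the formulation itself:","key":"tttEnvLeadIn1"}],"key":"tttEnvLeadIn2"},
{"type":"code","lang":"python","value":"from typing import List, Tuple\n\nBoard = Tuple[str, ...]  # 9 squares, each \" \", \"X\", or \"O\" (one possible encoding)\nAction = Tuple[str, int]  # (symbol, index of an empty square)\n\n\ndef initial_state() -> Board:\n    return (\" \",) * 9\n\n\ndef A(s: Board, h: int) -> List[Action]:\n    \"\"\"A_h(s): Max plays \"X\" on even turns, Min plays \"O\" on odd turns.\"\"\"\n    symbol = \"X\" if h % 2 == 0 else \"O\"\n    return [(symbol, i) for i, square in enumerate(s) if square == \" \"]\n\n\ndef P(s: Board, a: Action) -> Board:\n    \"\"\"Deterministic transition: mark the chosen square.\"\"\"\n    symbol, i = a\n    return s[:i] + (symbol,) + s[i + 1:]\n\n\n# The eight lines that decide the game: rows, columns, diagonals.\nLINES = [(0, 1, 2), (3, 4, 5), (6, 7, 8), (0, 3, 6), (1, 4, 7), (2, 5, 8), (0, 4, 8), (2, 4, 6)]\n\n\ndef r(s: Board) -> int:\n    \"\"\"Game score of a terminal state: +1 if Max (X) wins, -1 if Min (O) wins, 0 for a tie.\"\"\"\n    for i, j, k in LINES:\n        if s[i] == s[j] == s[k] != \" \":\n            return +1 if s[i] == \"X\" else -1\n    return 0","key":"tttEnvSketch1"},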
{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"text","value":"Our notation may remind you of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"XfrxIN0yYF"},{"type":"link","url":"/mdps","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"Markov decision processes","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"CfNGCrTFql"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"rxV5RZwOgX"},{"type":"text","value":".\nGiven that these games also involve a sequence of states and actions,\ncan we formulate them as finite-horizon MDPs?\nThe two settings are not exactly analogous,\nsince in MDPs we only consider a ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"ycRT0VMhkm"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"single","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"wxjfusMAA3"}],"key":"omVenZb0Ob"},{"type":"text","value":" policy,\nwhile these games involve two distinct players with opposite objectives.\nSince we want to analyze the behavior of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"XM3S9ZxaXJ"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"both","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"YvIKJ7LJBN"}],"key":"QyBUNSQQwF"},{"type":"text","value":" players at the same time,\ndescribing such a game as an MDP is more trouble than it’s worth.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"AfT703E779"}],"key":"iSLxJGOsNh"},
{"type":"heading","depth":2,"position":{"start":{"line":128,"column":1},"end":{"line":128,"column":1}},"children":[{"type":"text","value":"Min-max search *","position":{"start":{"line":128,"column":1},"end":{"line":128,"column":1}},"key":"vkmHM73lv1"}],"label":"min-max-search","identifier":"min-max-search","html_id":"min-max-search","enumerator":"8.3","key":"utJpfIY3Nw"},{"type":"admonition","kind":"important","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Important","key":"hjcISDrBws"}],"key":"GbGOwfvL7X"},{"type":"paragraph","position":{"start":{"line":131,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"The course (Fall 2024) does not cover min-max search.\nThis content is here to provide background on ","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"mqz4zxUTki"},{"type":"emphasis","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"children":[{"type":"text","value":"optimally","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"NmYKEbhqQi"}],"key":"yzngu4XDjq"},{"type":"text","value":" solving these deterministic, zero-sum, two-player games.","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"cOqzksOKbr"}],"key":"BpUWkLvY9y"}],"key":"upr7iOg3zI"},
{"type":"paragraph","position":{"start":{"line":135,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"In the introduction,\nwe claimed that we could win any potentially winnable game by looking ahead and predicting the opponent’s actions.\nThis would mean that each ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"O9wNFOpMMU"},{"type":"emphasis","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"OyjlfPq3HM"}],"key":"f9ldmQp5rV"},{"type":"text","value":" state already has a predetermined game score;\nthat is, in each state,\nit is already “obvious” which player is going to win.\nLet ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"t85z1BVt7E"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"efRuC8oiVM"},{"type":"text","value":" denote the game score under optimal play starting in state ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"rRRpn1wUFd"},{"type":"inlineMath","value":"s","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"sss","key":"DVHwGQwAzh"},{"type":"text","value":" at time ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"JmzHOPHNXF"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"h\\hih","key":"ayrIvcb1Mk"},{"type":"text","value":".\nWe can compute this by starting at the terminal states,\nwhen the game’s outcome is known,\nand working backwards,\nassuming that Max chooses the action that leads to the highest score\nand Min chooses the action that leads to the lowest 
score.","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"dlNSRwkFLO"}],"key":"ZO70CBYG0Q"},{"type":"proof","kind":"algorithm","label":"min-max-value","identifier":"min-max-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Min-max search algorithm","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"NVhRVn9wPF"}],"key":"SkRB3W5SPO"},{"type":"math","value":"V_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is even and } h < H \\\\\n\\min_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is odd and } h < H \\\\\n\\end{cases}","position":{"start":{"line":150,"column":1},"end":{"line":156,"column":1}},"html":"Vh(s)={r(s)h=HmaxaA(s)Vh+1(P(s,a))h is even and h<HminaA(s)Vh+1(P(s,a))h is odd and h<HV_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is even and } h < H \\\\\n\\min_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is odd and } h < H \\\\\n\\end{cases}Vh(s)=r(s)maxaA(s)Vh+1(P(s,a))minaA(s)Vh+1(P(s,a))h=Hh is even and h<Hh is odd and h<H","enumerator":"8.1","key":"GozfG3N0Xo"}],"enumerator":"8.1","html_id":"min-max-value","key":"RSkqY3iEr7"},{"type":"paragraph","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"This translates directly into a recursive depth-first search algorithm for searching the complete game tree.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"gtCq7su4Qt"}],"key":"LuFsgZzsOo"},{"type":"code","lang":"python","value":"def minimax_search(s, player) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), min)\n if v > v_max:\n a_max, v_max = a, v\n return a_max, v_max\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), max)\n if v < v_min:\n a_min, v_min = a, v\n return a_min, v_min","position":{"start":{"line":161,"column":1},"end":{"line":181,"column":1}},"key":"r1vusfAFrd"},{"type":"proof","kind":"example","label":"min-max-example","identifier":"min-max-example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Min-max search for a simple game","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"IULJuDeFjU"}],"key":"GpnOstJe6f"},{"type":"paragraph","position":{"start":{"line":186,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"Consider a simple game: Max chooses one of three possible actions (A, B, C),\nMin chooses one of three possible actions (D, E, F),\nand the combination leads to a certain integer outcome,\nshown in the table 
below:","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"wlxL2nghiT"}],"key":"S1YW5CTDeV"},{"type":"table","position":{"start":{"line":191,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[],"key":"B79fy6yLMe"},{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"D","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"KM57oTdVdo"}],"key":"hKciCSrWwB"},{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"E","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"x7LoIb3FqI"}],"key":"lrsAXlwi5s"},{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"F","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"PI19rrTOV5"}],"key":"pCs4PjAC8i"}],"key":"yJUp4wseSM"},{"type":"tableRow","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"A","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"TZAmDQdUZ9"}],"key":"lAWveqQUsL"},{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"4","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"DQHzawHgI1"}],"key":"QvMv20LsTQ"},{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"-2","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"umyj6I6GOC"}],"key":"kNQLIBszC9"},{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"5","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"UtvGpbtf9a"}],"key":"jqMyuyurNv"}],"key":"LzRCkVTGP2"},{"type":"tableRow","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"B","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"Kt5EDRdQd5"}],"key":"u25Ux6Q1t6"},{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"-3","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"eadYS7Hpi2"}],"key":"x02bLzWjzj"},{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"zKAIYZWwL8"}],"key":"h9L3SUkCSz"},{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"i3xxSrKf7E"}],"key":"a62qvlx4q3"}],"key":"NqdXNJyv4G"},{"type":"t
ableRow","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"C","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"VFDjNzymAi"}],"key":"ZMoldG39Pw"},{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"FlNALbtrP4"}],"key":"Md8sw5AVoJ"},{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"t03P90ZUXl"}],"key":"dzeAxVbOYR"},{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"YL6FkOsuej"}],"key":"Cs3D8v2ULT"}],"key":"GSW4TPsQY6"}],"key":"DoRgUb05gS"},{"type":"paragraph","position":{"start":{"line":197,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"We can visualize this as the following complete game tree,\nwhere each box contains the value ","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"CjbuxBtMc2"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"Mc21uy9Sgi"},{"type":"text","value":" of that node.\nThe min-max values of the terminal states are already known:","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"jYmEm6i0CB"}],"key":"L9kaA4mRsO"},{"type":"image","url":"/build/minmax-70b17e866836d498d3d814fd3fc3d9e3.png","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"IcqggvlJhT","urlSource":"./shared/minmax.png","urlOptimized":"/build/minmax-70b17e866836d498d3d814fd3fc3d9e3.webp"},{"type":"paragraph","position":{"start":{"line":203,"column":1},"end":{"line":207,"column":1}},"children":[{"type":"text","value":"We begin min-max search at the root,\nexploring each of Max’s actions.\nSuppose Max chooses action A.\nThen Min will choose action E to minimize the game score,\nmaking the value of this game node ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"i4vZYdXL1U"},{"type":"inlineMath","value":"\\min(4, -2, 5) = -2","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"min(4,2,5)=2\\min(4, -2, 5) = -2min(4,2,5)=2","key":"Tke0T5ybMU"},{"type":"text","value":".","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"uZ6gvi4rHF"}],"key":"S2sp1e3E3Y"},{"type":"image","url":"/build/minmax-2-d2c05b455ad2a4aef499542eadb0515d.png","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"key":"vuYwMkXHlS","urlSource":"./shared/minmax-2.png","urlOptimized":"/build/minmax-2-d2c05b455ad2a4aef499542eadb0515d.webp"},{"type":"paragraph","position":{"start":{"line":211,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Similarly, if Max chooses action A,\nthen Min will choose action D,\nand if Max chooses action C,\nthen Min will choose action F.\nWe can fill in the values of these nodes 
accordingly:","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"PP0Ex1HTJC"}],"key":"dS4mKXIDYm"},{"type":"image","url":"/build/minmax-3-f38c4f0467ce1216f1438052ec8a7d85.png","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"oNNZdrOCaj","urlSource":"./shared/minmax-3.png","urlOptimized":"/build/minmax-3-f38c4f0467ce1216f1438052ec8a7d85.webp"},{"type":"paragraph","position":{"start":{"line":219,"column":1},"end":{"line":220,"column":1}},"children":[{"type":"text","value":"Thus, Max’s best move is to take action C,\nresulting in a game score of ","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"joIPZ81Fvi"},{"type":"inlineMath","value":"\\max(-2, -3, -1) = -1","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"html":"max(2,3,1)=1\\max(-2, -3, -1) = -1max(2,3,1)=1","key":"JuRiQS4amq"},{"type":"text","value":".","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"IyvSBd1uQE"}],"key":"feqdjePyUW"},{"type":"image","url":"/build/minmax-4-013da4f214c0c822edc5b0e2b62d2f2a.png","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"grUIqd1aR9","urlSource":"./shared/minmax-4.png","urlOptimized":"/build/minmax-4-013da4f214c0c822edc5b0e2b62d2f2a.webp"}],"enumerator":"8.1","html_id":"min-max-example","key":"qvXyd4MROr"},{"type":"heading","depth":3,"position":{"start":{"line":225,"column":1},"end":{"line":225,"column":1}},"children":[{"type":"text","value":"Complexity of min-max search","position":{"start":{"line":225,"column":1},"end":{"line":225,"column":1}},"key":"Gp7ktxrg0u"}],"identifier":"complexity-of-min-max-search","label":"Complexity of min-max search","html_id":"complexity-of-min-max-search","implicit":true,"enumerator":"8.3.1","key":"UikbYmSNE4"},{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":231,"column":1}},"children":[{"type":"text","value":"At each of the ","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"QnLwZHVHnn"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"html":"H\\horH","key":"QuY0EYNA6F"},{"type":"text","value":" timesteps,\nthis algorithm iterates through the entire action space at that state,\nand therefore has a time complexity of ","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"mevcGpOY8v"},{"type":"inlineMath","value":"\\hor^{n_A}","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"html":"HnA\\hor^{n_A}HnA","key":"hT1x1p5kSh"},{"type":"text","value":"\n(where ","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"xi94nHhgWM"},{"type":"inlineMath","value":"n_A","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"html":"nAn_AnA","key":"v07EJHi865"},{"type":"text","value":" is the largest number of actions possibly available at once).\nThis makes the min-max algorithm impractical for even moderately sized games.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"yT9pcH9tAX"}],"key":"D2Ezs8vCAF"},{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"text","value":"But do we need to compute the exact value of 
","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"LNNP5wz0Oh"},{"type":"emphasis","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"every","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"mGmPjMeOpa"}],"key":"XyP1YK0zzC"},{"type":"text","value":" possible state?\nInstead, is there some way we could “ignore” certain actions and their subtrees\nif we already know of better options?\nThe ","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"ec21bqWkuK"},{"type":"strong","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"alpha-beta search","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"YstXB7cKxv"}],"key":"ElcooePJUC"},{"type":"text","value":" makes use of this intuition.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"Fip2NTPYFQ"}],"key":"K9YxDfALJq"},{"type":"heading","depth":2,"position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"Alpha-beta search","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"KE5vD6yXec"}],"label":"alpha-beta-search","identifier":"alpha-beta-search","html_id":"alpha-beta-search","enumerator":"8.4","key":"jW6Srgxo4o"},{"type":"paragraph","position":{"start":{"line":241,"column":1},"end":{"line":245,"column":1}},"children":[{"type":"text","value":"The intuition behind alpha-beta search is as follows:\nSuppose Max is in state ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"bUjY34N5Ah"},{"type":"inlineMath","value":"s","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"sss","key":"RO8D4CClUS"},{"type":"text","value":",\nand considering whether to take action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"vm6xpl9BK1"},{"type":"inlineMath","value":"a","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aaa","key":"ffNqSKR9GW"},{"type":"text","value":" or ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"kvZASw2O9Z"},{"type":"inlineMath","value":"a'","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aa'a","key":"J0vJlx9nk3"},{"type":"text","value":".\nIf at any point they find out that action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"ZG1Zmz0Ogy"},{"type":"inlineMath","value":"a'","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aa'a","key":"LZ4Ke8iEum"},{"type":"text","value":" is definitely worse than (or equal to) action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"WbuE1yP4Tj"},{"type":"inlineMath","value":"a","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aaa","key":"MfGTcuqVzw"},{"type":"text","value":",\nthey don’t need to evaluate action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"E4KiFQMXxY"},{"type":"inlineMath","value":"a'","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aa'a","key":"u3XpHZGRxe"},{"type":"text","value":" any 
further.","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"T60kfBaKpK"}],"key":"uUHkdOHAFj"},{"type":"paragraph","position":{"start":{"line":247,"column":1},"end":{"line":264,"column":1}},"children":[{"type":"text","value":"Concretely, we run min-max search as above,\nexcept now we keep track of two additional parameters ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"jLS01Xu1a8"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"HGrG3EjDh6"},{"type":"text","value":" and ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"pXQ4itIaz9"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"BchAExTGCi"},{"type":"text","value":" while evaluating each state.\nSuppose we are evaluating ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"VfOOUBRjTt"},{"type":"inlineMath","value":"V^\\star_\\hi(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"Vh(s)V^\\star_\\hi(s)Vh(s)","key":"wxEwUusNN1"},{"type":"text","value":",\nwhere it is Max’s turn (","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"EAjXPSFZwN"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"h\\hih","key":"kBHawtEQFA"},{"type":"text","value":" is even).\nWe update ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"tmM7zYw1Al"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"oUdlEt3LUI"},{"type":"text","value":" to be the ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"WemWfpDQqp"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"highest","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"klV4BTNGqi"}],"key":"ACoyGrgLtW"},{"type":"text","value":" value achievable from ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"wJC4VIv4dk"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"sHkIoMt6k1"},{"type":"text","value":" so far.\nThat is, the value of ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"MEG2nfpB7E"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"sCdMryxVzP"},{"type":"text","value":" is ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"f2p59lnNpT"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"at least","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"rU2FpPBddy"}],"key":"w6T45H4yEa"},{"type":"text","value":" ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"l2CD61fhWV"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"h3H85eDUTD"},{"type":"text","value":".\nSuppose Max chooses action 
","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"OVO7Jchd7O"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"Xccld5vQt1"},{"type":"text","value":", which leads to state ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"y8fmcKrFKE"},{"type":"inlineMath","value":"s'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"ss's","key":"qAZjz1Jy8G"},{"type":"text","value":", in which it is Min’s turn.\nIf any of Min’s actions in ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"t7jbTGzB2m"},{"type":"inlineMath","value":"s'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"ss's","key":"bjpc7JSohv"},{"type":"text","value":" achieve a value ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"LuL0KTfctl"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}(s') \\le \\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"Vh+1(s)α(s)V^\\star_{\\hi+1}(s') \\le \\alpha(s)Vh+1(s)α(s)","key":"nGb4GZAWux"},{"type":"text","value":",\nwe know that Max would not choose action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"n9nPrgcKAe"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"i1QpxVhAzc"},{"type":"text","value":",\nsince they know that it is ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"XCXr0AQHHE"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"worse","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"hyPr0E9vDF"}],"key":"JCIt75ctSJ"},{"type":"text","value":" than whichever action gave the value ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"TnTOkeiLC6"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"y0GGKKhP65"},{"type":"text","value":".\nSimilarly, to evaluate a state on Min’s turn,\nwe update ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"p60h1wEK8r"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"qzBkQ2TiaX"},{"type":"text","value":" to be the ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"YYvakLwmXS"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"lowest","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"JLktkIgQ3I"}],"key":"kPlHGpECvw"},{"type":"text","value":" value achievable from ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"SbjDJs8puf"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"Ogeu4tWBgC"},{"type":"text","value":" so far.\nThat is, the value of 
","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"UZgcMppSwe"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"hgwJdhiTrE"},{"type":"text","value":" is ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"rtc6tldQdN"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"at most","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"F96U5XuK1B"}],"key":"QwKVoDcPnT"},{"type":"text","value":" ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"LoVrYFo1ZD"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"l7Um8QP99P"},{"type":"text","value":".\nSuppose Min chooses action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"YBMA9rHnb0"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"T9hAI0s3TH"},{"type":"text","value":",\nwhich leads to state ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"oEzpeV0hVX"},{"type":"inlineMath","value":"s'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"ss's","key":"GNFiEITQOv"},{"type":"text","value":" for Max.\nIf Max has any actions that do ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"B8uLq1tdag"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"better","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"XHokj86AlL"}],"key":"vEH5KVsIAC"},{"type":"text","value":" than ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"PpgKUriAbo"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"MUFfRbrZBa"},{"type":"text","value":",\nthey would take it,\nmaking action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"DvrG57l8Ld"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"xAIJt3nPvB"},{"type":"text","value":" a suboptimal choice for Min.","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"pR1LDwiR7i"}],"key":"kxgYj5OXze"},{"type":"proof","kind":"example","label":"alpha-beta-example","identifier":"alpha-beta-example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Alpha-beta search for a simple game","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"pE5PelfBmq"}],"key":"nqRIm2iIz5"},{"type":"paragraph","position":{"start":{"line":269,"column":1},"end":{"line":273,"column":1}},"children":[{"type":"text","value":"Let us use the same simple game from ","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"zfLgNHpwt7"},{"type":"crossReference","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"children":[{"type":"text","value":"Example 
","key":"PS1pPtDZV6"},{"type":"text","value":"8.1","key":"hvFCEE3RvP"}],"identifier":"min-max-example","label":"min-max-example","kind":"proof:example","template":"Example %s","enumerator":"8.1","resolved":true,"html_id":"min-max-example","key":"xWcWOdxMQ0"},{"type":"text","value":".\nWe list the values of ","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"rzgtC9o3nn"},{"type":"inlineMath","value":"\\alpha(s), \\beta(s)","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"html":"α(s),β(s)\\alpha(s), \\beta(s)α(s),β(s)","key":"tIIptduMp6"},{"type":"text","value":" in each node throughout the algorithm.\nThese values are initialized to ","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"WR7atgBCQD"},{"type":"inlineMath","value":"-\\infty, +\\infty","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"html":",+-\\infty, +\\infty,+","key":"MBLEAUWpRt"},{"type":"text","value":" respectively.\nWe shade any squares that have not been visited by the algorithm,\nand we assume that actions are evaluated from left to right.","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"JzYjrcbU73"}],"key":"amTns9nhPA"},{"type":"image","url":"/build/alpha-beta-0-7ad590b6317a7a6f64b4e368eda30e33.png","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"RIA11m51dx","urlSource":"./shared/alpha-beta-0.png","urlOptimized":"/build/alpha-beta-0-7ad590b6317a7a6f64b4e368eda30e33.webp"},{"type":"paragraph","position":{"start":{"line":277,"column":1},"end":{"line":280,"column":1}},"children":[{"type":"text","value":"Suppose Max takes action A. Let ","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"PsgFxwGC7x"},{"type":"inlineMath","value":"s'","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"ss's","key":"uFOsKFFVjV"},{"type":"text","value":" be the resulting game state.\nThe values of ","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"BM2WzFseVD"},{"type":"inlineMath","value":"\\alpha(s')","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"α(s)\\alpha(s')α(s)","key":"YxHsehyqms"},{"type":"text","value":" and ","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"c1zdnOtWdK"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"oqU3yIrftS"},{"type":"text","value":"\nare initialized at the same values as the root state,\nsince we want to prune a subtree if there exists a better action at any step higher in the tree.","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"LO8DzgUVvH"}],"key":"lRLLsnmpzk"},{"type":"image","url":"/build/alpha-beta-1-b9d0c4a2b1ab3150a403c943682c4a80.png","position":{"start":{"line":282,"column":1},"end":{"line":282,"column":1}},"key":"pr12oZaFh9","urlSource":"./shared/alpha-beta-1.png","urlOptimized":"/build/alpha-beta-1-b9d0c4a2b1ab3150a403c943682c4a80.webp"},{"type":"paragraph","position":{"start":{"line":284,"column":1},"end":{"line":285,"column":1}},"children":[{"type":"text","value":"Then we iterate through Min’s possible actions,\nupdating the value of 
","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"hz5PrqTWdU"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"a1IAlckrBW"},{"type":"text","value":" as we go.","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"qeDYFdik3X"}],"key":"ZJO2dhL7OV"},{"type":"paragraph","position":{"start":{"line":287,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"image","url":"/build/alpha-beta-2-b0d0597f3562685a2759d1d56f661682.png","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"u50H0sIsBj","urlSource":"./shared/alpha-beta-2.png","urlOptimized":"/build/alpha-beta-2-b0d0597f3562685a2759d1d56f661682.webp"},{"type":"text","value":"\n","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"PCygiCMW5e"},{"type":"image","url":"/build/alpha-beta-3-fcd7a3fcb02f86c22e47c8168d151549.png","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"mPyumquQ8b","urlSource":"./shared/alpha-beta-3.png","urlOptimized":"/build/alpha-beta-3-fcd7a3fcb02f86c22e47c8168d151549.webp"}],"key":"H3aYhrO4lR"},{"type":"paragraph","position":{"start":{"line":290,"column":1},"end":{"line":292,"column":1}},"children":[{"type":"text","value":"Once the value of state ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"KhrHCEnoRx"},{"type":"inlineMath","value":"s'","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"html":"ss's","key":"diCXohMQrY"},{"type":"text","value":" is fully evaluated,\nwe know that Max can achieve a value of ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"VCcgIhD2XK"},{"type":"emphasis","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"children":[{"type":"text","value":"at least","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"NeJZHae4IX"}],"key":"f2PcP7vZqg"},{"type":"text","value":" ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"dAQk9QlMMI"},{"type":"text","value":"-2","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"CQFjqsn46Z"},{"type":"text","value":" starting from the root,\nand so we update ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"FZTjPksKLq"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"BfwL4k2ayP"},{"type":"text","value":", where ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"M497DW703W"},{"type":"inlineMath","value":"s","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"html":"sss","key":"etTmb5L4wn"},{"type":"text","value":" is the root state:","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"e6pTHrgtxd"}],"key":"FgI0ch9l03"},{"type":"image","url":"/build/alpha-beta-4-e3958ef0c8cbcb3b559e8a63d1cc1e6b.png","position":{"start":{"line":294,"column":1},"end":{"line":294,"column":1}},"key":"EcNf9eN1OY","urlSource":"./shared/alpha-beta-4.png","urlOptimized":"/build/alpha-beta-4-e3958ef0c8cbcb3b559e8a63d1cc1e6b.webp"},{"type":"paragraph","position":{"start":{"line":296,"column":1},"end":{"line":297,"column":1}},"children":[{"type":"text","value":"Then Max imagines taking action B. 
Again, let ","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"wuCC0tnLVq"},{"type":"inlineMath","value":"s'","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"html":"ss's","key":"CoyvvY8xqk"},{"type":"text","value":" denote the resulting game state.\nWe initialize ","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"k7l21aeHHA"},{"type":"inlineMath","value":"\\alpha(s')","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"html":"α(s)\\alpha(s')α(s)","key":"GalhcMByqk"},{"type":"text","value":" and ","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"wuUUZqAAdY"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"PMKKSIo7id"},{"type":"text","value":" from the root:","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"A8mYMDxWSe"}],"key":"XkVgTFvxIE"},{"type":"image","url":"/build/alpha-beta-5-f16710428d22fbb7c1a5dbc054a71a7c.png","position":{"start":{"line":299,"column":1},"end":{"line":299,"column":1}},"key":"vG79rIHSVW","urlSource":"./shared/alpha-beta-5.png","urlOptimized":"/build/alpha-beta-5-f16710428d22fbb7c1a5dbc054a71a7c.webp"},{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":309,"column":1}},"children":[{"type":"text","value":"Now suppose Min takes action D, resulting in a value of ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"lipUYAHKX0"},{"type":"text","value":"-3","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"SIAsRNZdHp"},{"type":"text","value":".\nWe see that ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"JGSkA854Sa"},{"type":"inlineMath","value":"V^\\star_\\hi(s') = \\min(-3, x, y)","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"Vh(s)=min(3,x,y)V^\\star_\\hi(s') = \\min(-3, x, y)Vh(s)=min(3,x,y)","key":"ydxXJmA978"},{"type":"text","value":",\nwhere ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"REcB8JO4O0"},{"type":"inlineMath","value":"x","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"xxx","key":"o07Qj9AJ27"},{"type":"text","value":" and ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"A87VqSLS8S"},{"type":"inlineMath","value":"y","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"yyy","key":"j7f4BUr3GR"},{"type":"text","value":" are the values of the remaining two actions.\nBut since ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"qtpr2DDPgE"},{"type":"inlineMath","value":"\\min(-3, x, y) \\le -3","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"min(3,x,y)3\\min(-3, x, y) \\le -3min(3,x,y)3","key":"edAUCcrmbJ"},{"type":"text","value":",\nwe know that the value of ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"w1iz31qpCK"},{"type":"inlineMath","value":"s'","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"ss's","key":"trUAhbujdU"},{"type":"text","value":" is at most 
","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Gemk1rlqIb"},{"type":"text","value":"-3","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"RKQoLl6SPf"},{"type":"text","value":".\nBut Max can achieve a better value of ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"utmHB1Hh9z"},{"type":"inlineMath","value":"\\alpha(s') = -2","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"α(s)=2\\alpha(s') = -2α(s)=2","key":"AGhopR5S3k"},{"type":"text","value":" by taking action A,\nand so Max will never take action B,\nand we can prune the search here.\nWe will use dotted lines to indicate states that have been ruled out from the search:","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"MXYfIhyQZc"}],"key":"TBMYCAD8Z9"},{"type":"image","url":"/build/alpha-beta-6-1f7516f925d212dc9290ccf221a7d28e.png","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"OkWBi60rBF","urlSource":"./shared/alpha-beta-6.png","urlOptimized":"/build/alpha-beta-6-1f7516f925d212dc9290ccf221a7d28e.webp"},{"type":"paragraph","position":{"start":{"line":313,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Finally, suppose Max takes action C.\nFor Min’s actions D and E,\nthere is still a chance that action C might outperform action A,\nso we continue expanding:","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"A3bNfrkZfm"}],"key":"vlPh2hjDlj"},{"type":"paragraph","position":{"start":{"line":318,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"image","url":"/build/alpha-beta-7-648c7023e2fdb207fac5a83dbd8abd64.png","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"UxzmOiBJYG","urlSource":"./shared/alpha-beta-7.png","urlOptimized":"/build/alpha-beta-7-648c7023e2fdb207fac5a83dbd8abd64.webp"},{"type":"text","value":"\n","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"I73mjAqYBa"},{"type":"image","url":"/build/alpha-beta-8-fb8654bf1f1f361f3098f7a2c0ace9bd.png","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"hRWob9rcj8","urlSource":"./shared/alpha-beta-8.png","urlOptimized":"/build/alpha-beta-8-fb8654bf1f1f361f3098f7a2c0ace9bd.webp"}],"key":"uRHqxKo5ZE"},{"type":"paragraph","position":{"start":{"line":321,"column":1},"end":{"line":323,"column":1}},"children":[{"type":"text","value":"Finally, we see that Min taking action F achieves the minimum value at this state.\nThis shows that optimal play is for Max to take action C,\nand Min to take action F.","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"vPstVy1zVi"}],"key":"sCWHZQzdTP"},{"type":"image","url":"/build/alpha-beta-9-f7d61365563b59cdcecc22ca3e301bc6.png","position":{"start":{"line":325,"column":1},"end":{"line":325,"column":1}},"key":"wYv4HbJAZx","urlSource":"./shared/alpha-beta-9.png","urlOptimized":"/build/alpha-beta-9-f7d61365563b59cdcecc22ca3e301bc6.webp"}],"enumerator":"8.2","html_id":"alpha-beta-example","key":"AgNKChUzUM"},{"type":"code","lang":"python","value":"def alpha_beta_search(s, player, alpha, beta) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = 
minimax_search(env.step(s, a), min, alpha, beta)\n if v > v_max:\n a_max, v_max = a, v\n alpha = max(alpha, v)\n if v_max >= beta:\n # we know Min will not choose the action that leads to this state\n return a_max, v_max\n return a_max, v_max\n\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), max)\n if v < v_min:\n a_min, v_min = a, v\n beta = min(beta, v)\n if v_min <= alpha:\n # we know Max will not choose the action that leads to this state\n return a_min, v_min\n return a_min, v_min","position":{"start":{"line":329,"column":1},"end":{"line":358,"column":1}},"key":"KAffZzDgRj"},{"type":"paragraph","position":{"start":{"line":360,"column":1},"end":{"line":368,"column":1}},"children":[{"type":"text","value":"How do we choose what ","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"SEIVOecYeJ"},{"type":"emphasis","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"text","value":"order","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"s991OJZSua"}],"key":"QgoPwDDD4D"},{"type":"text","value":" to explore the branches?\nAs you can tell, this significantly affects the efficiency of the pruning algorithm.\nIf Max explores the possible actions in order from worst to best,\nthey will not be able to prune any branches at all!\nAdditionally, to verify that an action is suboptimal,\nwe must run the search recursively from that action,\nwhich ultimately requires traversing the tree all the way to a leaf node.\nThe longer the game might possibly last,\nthe more computation we have to run.","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"plUuz1qtwi"}],"key":"CY0biH1hEy"},{"type":"paragraph","position":{"start":{"line":370,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"text","value":"In practice, we can often use background information about the game to develop a ","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"hdK7vkPOMS"},{"type":"strong","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"children":[{"type":"text","value":"heuristic","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"pjDpIsyMDI"}],"key":"f1SCaDyG9W"},{"type":"text","value":" for evaluating possible actions.\nIf a technique is based on background information or intuition,\nespecially if it isn’t rigorously justified,\nwe call it a heuristic.","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"gGdPCYtiPn"}],"key":"uIXPSOVkSa"},{"type":"paragraph","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"children":[{"type":"text","value":"Can we develop ","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"bT3f67dCpy"},{"type":"emphasis","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"children":[{"type":"text","value":"heuristic methods","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"q3nk5Oe3vl"}],"key":"G5UswGVLLG"},{"type":"text","value":" for tree exploration that works for all sorts of games?","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"lU72GEpAN4"}],"key":"GjpDWOYtRc"},{"type":"comment","value":" Here's where we can incorporate the _reinforcement learning_ 
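{"type":"paragraph","children":[{"type":"text","value":"For instance, we could sort the actions by their heuristic scores before the main loop of alpha_beta_search, so that promising moves are explored first and more branches get pruned. The following is only a sketch: heuristic(s, a) stands for a hypothetical, game-specific scoring function, not something defined above.","key":"addOrderLeadTxt"}],"key":"addOrderLeadPar"},{"type":"code","lang":"python","value":"def order_actions(s, actions, heuristic):\n    \"\"\"Sort actions so that the most promising (per the heuristic) come first.\n\n    heuristic(s, a) is any cheap, game-specific estimate of how good\n    action a is in state s; it does not need to be exact.\n    \"\"\"\n    return sorted(actions, key=lambda a: heuristic(s, a), reverse=True)\n\n# Inside alpha_beta_search, we would loop over\n# order_actions(s, actions, heuristic) instead of actions,\n# so that strong moves are searched first and weak ones are pruned sooner.","key":"addOrderCode"},{"type":"paragraph","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"children":[{"type":"text","value":"Can we develop ","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"bT3f67dCpy"},{"type":"emphasis","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"children":[{"type":"text","value":"heuristic methods","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"q3nk5Oe3vl"}],"key":"G5UswGVLLG"},{"type":"text","value":" for tree exploration that work for all sorts of games?","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"lU72GEpAN4"}],"key":"GjpDWOYtRc"},{"type":"comment","value":" Here's where we can incorporate the _reinforcement learning_ 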
","key":"g92yfup9Pd"},{"type":"heading","depth":2,"position":{"start":{"line":379,"column":1},"end":{"line":379,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":379,"column":1},"end":{"line":379,"column":1}},"key":"nXhT8RxuUq"}],"label":"monte-carlo-tree-search","identifier":"monte-carlo-tree-search","html_id":"monte-carlo-tree-search","enumerator":"8.5","key":"FbP3Bp9e6b"},{"type":"paragraph","position":{"start":{"line":381,"column":1},"end":{"line":383,"column":1}},"children":[{"type":"text","value":"The task of evaluating actions in a complex environment might seem familiar.\nWe’ve encountered this problem before in both the ","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"ogtsd0K5Qv"},{"type":"link","url":"/bandits","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"o39QRa9uc3"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"rsINn3ke33"},{"type":"text","value":" setting and the ","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"f7ofTo1UxQ"},{"type":"link","url":"/mdps","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"children":[{"type":"text","value":"Markov decision process","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"wyaRqH1V3K"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"FHrUWe9L8L"},{"type":"text","value":" setting.\nNow we’ll see how to combine concepts from these to form a more general and efficient tree search heuristic called ","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"xlvKQaIBjM"},{"type":"strong","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"Pe7TqUJL7e"}],"key":"NuNb0SVlos"},{"type":"text","value":" (MCTS).","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"EPqOfskT4U"}],"key":"TTVB0yqK9w"},{"type":"paragraph","position":{"start":{"line":385,"column":1},"end":{"line":390,"column":1}},"children":[{"type":"text","value":"When a problem is intractable to solve ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"O5VrfZNFpK"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"Zf3VWebHIE"}],"key":"NWFuFsXKDi"},{"type":"text","value":",\nwe often turn to ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"E54PyqejaX"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"g0Z68AjpOV"}],"key":"vwsCnYEB8z"},{"type":"text","value":" algorithms that sacrifice some accuracy in exchange for computational efficiency.\nMCTS also improves on alpha-beta search in this sense.\nAs the name suggests,\nMCTS uses 
","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"YRBaY3jmnn"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"Monte Carlo","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"FUCHCVC6Yn"}],"key":"Bk9UTFFqvX"},{"type":"text","value":" simulation, that is, collecting random samples and computing the sample statistics,\nin order to ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"qg9wHYfPu9"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"JBhLFK7dgx"}],"key":"rTBDQUvEs6"},{"type":"text","value":" the value of each action.","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"pXNGzAz1aY"}],"key":"eUScOehWtJ"},{"type":"paragraph","position":{"start":{"line":392,"column":1},"end":{"line":398,"column":1}},"children":[{"type":"text","value":"As before, we imagine a complete game tree in which each path represents an ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"XwgFoCkiha"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"entire game","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"QokmfwVKw8"}],"key":"qIY8HDrZqA"},{"type":"text","value":".\nThe goal of MCTS is to assign values to only the game states that are ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"i2uYyXbLrg"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"relevant","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"z6BbMnR6zO"}],"key":"iQXbSStSiX"},{"type":"text","value":" to the ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"ZnEssV1AvR"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"current game","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"Mg6yGFhHwy"}],"key":"sQW5RQJ4cx"},{"type":"text","value":";\nWe gradually expand the tree at each move.\nFor comparison, in alpha-beta search,\nthe entire tree only needs to be solved ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"aonwzB2xrV"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"once","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"m1B3NEfDpp"}],"key":"rmuwcluHXP"},{"type":"text","value":",\nand from then on,\nchoosing an action is as simple as taking a maximum over the previously computed values.","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"fNZ07piHD0"}],"key":"lmNk8rG9ta"},{"type":"paragraph","position":{"start":{"line":400,"column":1},"end":{"line":404,"column":1}},"children":[{"type":"text","value":"The crux of MCTS is approximating the win probability of a state by a 
","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"i664pzIidJ"},{"type":"emphasis","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"sample probability","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"MpPJw5Tmzq"}],"key":"hV4ZSxngEJ"},{"type":"text","value":".\nIn practice, MCTS is used for games with ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"LOxpwTqqU3"},{"type":"emphasis","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"binary outcomes","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"Y9lqsMm7jf"}],"key":"nQI1BdWxha"},{"type":"text","value":" where ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"ETepI8yKIG"},{"type":"inlineMath","value":"r(s) \\in \\{ +1, -1 \\}","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"r(s){+1,1}r(s) \\in \\{ +1, -1 \\}r(s){+1,1}","key":"P9BgkaQ7dD"},{"type":"text","value":",\nand so this is equivalent to approximating the final game score.\nTo approximate the win probability from state ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"atTU4IpAAi"},{"type":"inlineMath","value":"s","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"sss","key":"rC7R7K6Glz"},{"type":"text","value":",\nMCTS samples random games starting in ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"ANDzS1oYnX"},{"type":"inlineMath","value":"s","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"sss","key":"v6wFC87tcZ"},{"type":"text","value":" and computes the sample proportion of those that the player wins.","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"qrHTGeaWfH"}],"key":"KowrZ1Vo1e"},{"type":"paragraph","position":{"start":{"line":406,"column":1},"end":{"line":410,"column":1}},"children":[{"type":"text","value":"Note that, for a given state ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"rjAdbn0Qiy"},{"type":"inlineMath","value":"s","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"html":"sss","key":"YMWs7MX4Wf"},{"type":"text","value":",\nchoosing the best action ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"TgXrNi90lB"},{"type":"inlineMath","value":"a","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"html":"aaa","key":"pFqnU4ergC"},{"type":"text","value":" can be framed as a ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"le2zKc3zl0"},{"type":"link","url":"/bandits","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"TMDathHs4R"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"rikNVGErxr"},{"type":"text","value":" problem,\nwhere each action corresponds to an arm,\nand the reward distribution of arm 
","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"cVFH0vK14a"},{"type":"inlineMath","value":"k","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"html":"kkk","key":"G99Rv2RJ0x"},{"type":"text","value":" is the distribution of the game score over random games after choosing that arm.\nThe most commonly used bandit algorithm in practice for MCTS is the ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"bOj1jVtMhJ"},{"type":"crossReference","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"children":[{"type":"text","value":"Upper Confidence Bound (UCB)","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"ERzHy7i6WT"}],"identifier":"ucb","label":"ucb","kind":"heading","template":"Section %s","enumerator":"3.6","resolved":true,"html_id":"ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"LWyI17x9kT"},{"type":"text","value":" algorithm.","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"XJci0Ui91y"}],"key":"g0XQeg3fFw"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Summary of UCB","position":{"start":{"line":412,"column":1},"end":{"line":412,"column":1}},"key":"KmgstOTuX8"}],"key":"rqI2cUvBy4"},{"type":"paragraph","position":{"start":{"line":413,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"Let us quickly review the UCB bandit algorithm.\nFor each arm ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"pv5PdLiqXo"},{"type":"inlineMath","value":"k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"kkk","key":"NnSrEv2zrx"},{"type":"text","value":", we track the sample mean","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"tMpmDeV9ZK"}],"key":"frm42KwRmL"},{"type":"math","value":"\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tau","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"tight":true,"html":"μ^tk=1Ntkτ=0t11{aτ=k}rτ\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tauμ^tk=Ntk1τ=0t11{aτ=k}rτ","enumerator":"8.2","key":"mC6G79ixum"},{"type":"paragraph","position":{"start":{"line":413,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"of all rewards from that arm up to time ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"CoNV0dWATO"},{"type":"inlineMath","value":"t","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"ttt","key":"TSnFgFV018"},{"type":"text","value":".\nThen we construct a ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"lHfK7DKZec"},{"type":"emphasis","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"children":[{"type":"text","value":"confidence interval","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"erLBofuAIP"}],"key":"DfFeoIV7l2"},{"type":"text","value":"","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"aOHm3964mP"}],"key":"RAwaDczIwb"},{"type":"math","value":"C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + B_t^k],","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"tight":true,"html":"Ctk=[μ^tkBtk,μ^tk+Btk],C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + 
B_t^k],Ctk=[μ^tkBtk,μ^tk+Btk],","enumerator":"8.3","key":"Xd2wQ7M6xo"},{"type":"paragraph","position":{"start":{"line":413,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"GfUiOdVFfG"},{"type":"inlineMath","value":"B_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"Btk=ln(2t/δ)2NtkB_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}Btk=2Ntkln(2t/δ)","key":"BtuqGnqHhA"},{"type":"text","value":" is given by Hoeffding’s inequality,\nso that with probability at least ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"JPcn950d1V"},{"type":"text","value":"1 - δ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"t4fdAzHXxi"},{"type":"text","value":" (where δ is a parameter we choose),\nthe true mean ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"ayiM9EGduk"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"μk\\mu^kμk","key":"wm33TFIvjD"},{"type":"text","value":" lies within ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"YaL7Z4piEx"},{"type":"inlineMath","value":"C_t^k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"CtkC_t^kCtk","key":"bKJrbLBf2E"},{"type":"text","value":".\nNote that ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"O1bMRY2HW8"},{"type":"inlineMath","value":"B_t^k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"BtkB_t^kBtk","key":"BC1U2SnZBp"},{"type":"text","value":" scales like ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"rnRxpEQG49"},{"type":"inlineMath","value":"\\sqrt{1/N^k_t}","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"1/Ntk\\sqrt{1/N^k_t}1/Ntk","key":"qT9scFyqXQ"},{"type":"text","value":",\ni.e. 
the more we have visited that arm,\nthe more confident we get about it,\nand the narrower the confidence interval.","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"caRzH6RHQw"}],"key":"RXiLZFQNR6"},{"type":"paragraph","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"children":[{"type":"text","value":"To select an arm, we pick the arm with the highest ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"hFxM2bqUCt"},{"type":"emphasis","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"fTQxYvluQg"}],"key":"to490EGUyi"},{"type":"text","value":".","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"nNVJr3KASG"}],"key":"AjMvtNc4OP"}],"key":"TMphx9ClWP"},{"type":"paragraph","position":{"start":{"line":430,"column":1},"end":{"line":431,"column":1}},"children":[{"type":"text","value":"This means that, for each edge (corresponding to a state-action pair ","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"key":"qDhfcrlxqc"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"wS4uSw1GnX"},{"type":"text","value":") in the game tree,\nwe keep track of the statistics required to compute its UCB:","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"key":"tvSLLWH4Q5"}],"key":"TKQFzF9r5K"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":433,"column":1},"end":{"line":436,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"children":[{"type":"text","value":"How many times it has been “visited” (","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"J9FJTCw4yO"},{"type":"inlineMath","value":"N_t^{s, a}","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"html":"Nts,aN_t^{s, a}Nts,a","key":"lqIM3sGGmK"},{"type":"text","value":")","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"BUorh5sxop"}],"key":"dvtG428r1S"},{"type":"listItem","spread":true,"position":{"start":{"line":434,"column":1},"end":{"line":436,"column":1}},"children":[{"type":"text","value":"How many of those visits resulted in victory (","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"key":"FKpuMKJot1"},{"type":"inlineMath","value":"\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tau","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"html":"τ=0t11{(sτ,aτ)=(s,a)}rτ\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tauτ=0t11{(sτ,aτ)=(s,a)}rτ","key":"dHMtnC81BX"},{"type":"text","value":").\nLet us call this latter value ","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"key":"F9VCi3Hqy3"},{"type":"inlineMath","value":"W^{s, a}_t","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"html":"Wts,aW^{s, a}_tWts,a","key":"Ix12nUmb8O"},{"type":"text","value":" (for number of 
“wins”).","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"key":"jhDaA0o12C"}],"key":"keFwTyGX9O"}],"key":"jtLvaKv4Yz"},{"type":"paragraph","position":{"start":{"line":437,"column":1},"end":{"line":444,"column":1}},"children":[{"type":"text","value":"What does ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"UvXNQD5Kqw"},{"type":"inlineMath","value":"t","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"ttt","key":"kcSfHXsgIN"},{"type":"text","value":" refer to in the above expressions?\nRecall ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"AKDLTnPHMB"},{"type":"inlineMath","value":"t","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"ttt","key":"Fa4V9yjtBm"},{"type":"text","value":" refers to the number of time steps elapsed in the ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"AnVzYcQEWA"},{"type":"emphasis","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"children":[{"type":"text","value":"bandit environment","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"Rh3XhLaNKY"}],"key":"A3aAb94gIx"},{"type":"text","value":".\nAs mentioned above,\neach state ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"uTLgA7Rlef"},{"type":"inlineMath","value":"s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"sss","key":"VtZyBP3Nkw"},{"type":"text","value":" corresponds to its own bandit environment,\nand so ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"oheZd6A5vf"},{"type":"inlineMath","value":"t","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"ttt","key":"SBLbcgjCGQ"},{"type":"text","value":" refers to ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"tcgIAImVRK"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"NsN^sNs","key":"pCG4xSBDwa"},{"type":"text","value":", that is,\nhow many actions have been taken from state ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"BtBeAQO5cZ"},{"type":"inlineMath","value":"s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"sss","key":"lGxQOXCazP"},{"type":"text","value":".\nThis term, ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"KGCMOw0tN9"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"NsN^sNs","key":"O3tFxItxpk"},{"type":"text","value":", gets incremented as the algorithm runs;\nfor simplicity, we won’t introduce another index to track how it changes.","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"W6BM4JBygg"}],"key":"rrgpMT0MEE"},{"type":"proof","kind":"algorithm","label":"mcts-algorithm","identifier":"mcts-algorithm","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search 
algorithm","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"PRMfrNr90G"}],"key":"aP9jyTlc7p"},{"type":"paragraph","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"qK4b64UHCV"}],"key":"LRlN4IvWLo"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":450,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"html":"TTT","key":"wJomc7l7pu"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"key":"MZUjinHLrU"}],"key":"u5NBjM8zre"},{"type":"listItem","spread":true,"position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"children":[{"type":"inlineMath","value":"\\pi_{\\text{rollout}}","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"πrollout\\pi_{\\text{rollout}}πrollout","key":"bQkJXmY3bE"},{"type":"text","value":", the ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"LClaL58Pnz"},{"type":"strong","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"children":[{"type":"text","value":"rollout policy","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"FhRtiydxAA"}],"key":"YZBNHLZpac"},{"type":"text","value":" for randomly sampling games","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"dl5qHGPxOO"}],"key":"ylbWGRylmW"},{"type":"listItem","spread":true,"position":{"start":{"line":452,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"html":"ccc","key":"AYuGWZXqaq"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"key":"V3pZBgjT0Y"}],"key":"n7jeO7Jhd6"}],"key":"dhI7KQbujI"},{"type":"paragraph","position":{"start":{"line":454,"column":1},"end":{"line":458,"column":1}},"children":[{"type":"text","value":"To choose a single move starting at state ","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"Me7DxdSfpL"},{"type":"inlineMath","value":"s_{\\text{start}}","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"html":"sstarts_{\\text{start}}sstart","key":"X8pFWwvtOJ"},{"type":"text","value":",\nMCTS first tries to estimate the UCB values for each of the possible actions ","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"TOnGopHFD8"},{"type":"inlineMath","value":"\\mathcal{A}(s_\\text{start})","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"html":"A(sstart)\\mathcal{A}(s_\\text{start})A(sstart)","key":"zAKWD8gWsZ"},{"type":"text","value":",\nand then chooses the best one.\nTo estimate the UCB values,\nit repeats the following four steps ","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"seN6iOj5BH"},{"type":"inlineMath","value":"T","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"html":"TTT","key":"m8jzxp6jZH"},{"type":"text","value":" 
times:","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"hrsxRObqDc"}],"key":"YJX7AMC5TF"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":460,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":460,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"strong","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"cdoKPqm9Ld"}],"key":"OlhaKVVBsf"},{"type":"text","value":": We start at ","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"Fomb2EtsMS"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"Ri6nPL3XrC"},{"type":"text","value":". Let ","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"CCFOfI3V7m"},{"type":"text","value":"τ","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"ifotvn5WyN"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"nTOEVqiypO"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":461,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":461,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"key":"VnI32V3jl7"},{"type":"inlineMath","value":"s","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"html":"sss","key":"QF5Cs7VOux"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"key":"or4lSqeElp"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":462,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":462,"column":1},"end":{"line":466,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"k2wR72hyIU"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"gvjYz9T6dg"},{"type":"text","value":", where\n","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"lq8LffXgHF"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"identifier":"ucb-tree","label":"ucb-tree","html_id":"ucb-tree","html":"UCBs,a=Ws,aNs+clnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}UCBs,a=NsWs,a+cNs,alnNs","enumerator":"8.4","key":"wOLRc3XOqD"}],"key":"zioIV3B4RG"},{"type":"listItem","spread":true,"position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"children":[{"type":"text","value":"Append 
","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"key":"FCvAC5RrZv"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"l0ladSZffT"},{"type":"text","value":" to ","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"key":"Vb57uoZHPR"},{"type":"text","value":"τ","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"key":"PeOBhhcy9L"}],"key":"uERO4YggTm"},{"type":"listItem","spread":true,"position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"key":"Eg3ijufVLs"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"html":"sP(s,a)s \\gets P(s, a)sP(s,a)","key":"dyQstCoX49"}],"key":"xWV5AYuuDT"}],"key":"x7ZjvEyHTE"}],"key":"pQwJrtNK0h"}],"key":"eIZqnABJT8"}],"key":"S0KSyL1MhQ"},{"type":"listItem","spread":true,"position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"children":[{"type":"strong","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"xrHvgrP1vt"}],"key":"XE6myypQae"},{"type":"text","value":": Let ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"AVKOP0Ka9H"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"html":"snews_\\text{new}snew","key":"HAGCVLL9xZ"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"nAz9mw7YOx"},{"type":"text","value":"τ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"LhtffDJFFu"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"QLMIiPxWdh"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"html":"snews_\\text{new}snew","key":"FIbNGETyaj"},{"type":"text","value":". Call it ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"u7EceRw6iQ"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"html":"anewa_{\\text{new}}anew","key":"F4AOoYPnd8"},{"type":"text","value":". 
Add it to ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"jF1UaKMC8p"},{"type":"text","value":"τ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"gbb5xnwF4s"},{"type":"text","value":".","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"wvANId0STJ"}],"key":"B6erRIkNwg"},{"type":"listItem","spread":true,"position":{"start":{"line":470,"column":1},"end":{"line":472,"column":1}},"children":[{"type":"strong","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"v13xPb4bSQ"}],"key":"GrHisN7AVk"},{"type":"text","value":": Simulate a complete game episode by starting with the action ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"hwuQHXBH9b"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"anewa_{\\text{new}}anew","key":"OTKNC7PgPD"},{"type":"text","value":"\nand then playing according to ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"rIpOL5uh5Z"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"EBr5Qilw8O"},{"type":"text","value":".\nThis results in the outcome ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"t1V2CRnQja"},{"type":"inlineMath","value":"r \\in \\{ +1, -1 \\}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"r{+1,1}r \\in \\{ +1, -1 \\}r{+1,1}","key":"ADlyC8j51c"},{"type":"text","value":".","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"KvMz82KGrb"}],"key":"Ovo5pzF20W"},{"type":"listItem","spread":true,"position":{"start":{"line":473,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"strong","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"key":"pyBarI5Asy"}],"key":"Xh3m19cz2F"},{"type":"text","value":": For each ","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"key":"sAeH70sFvV"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"m7JWcGOTty"},{"type":"text","value":":","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"key":"u8t37ikwL0"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":474,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":474,"column":1},"end":{"line":474,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":474,"column":1},"end":{"line":474,"column":1}},"key":"F8T9hP3HWw"},{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":474,"column":1},"end":{"line":474,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"O1QgqpHfzC"}],"key":"j4qRPCRuQV"},{"type":"listItem","spread":true,"position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + 
r","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"html":"Ws,aWs,a+rW^{s, a} \\gets W^{s, a} + rWs,aWs,a+r","key":"MvOXCay66m"}],"key":"ZgNH3aQgOn"},{"type":"listItem","spread":true,"position":{"start":{"line":476,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"key":"v7am60ZzrI"},{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"bABnp5Z6EX"}],"key":"lzbTMeH8vJ"}],"key":"yfqieVNOEV"}],"key":"vfzbIKpuTm"}],"key":"KhdwYhTRf8"},{"type":"paragraph","position":{"start":{"line":478,"column":1},"end":{"line":480,"column":1}},"children":[{"type":"text","value":"After ","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"key":"vXkcqRIv6H"},{"type":"inlineMath","value":"T","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"html":"TTT","key":"pjlGOtR4ZQ"},{"type":"text","value":" repeats of the above,\nwe return the action with the highest UCB value ","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"key":"m6ea0SJPmh"},{"type":"crossReference","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"children":[{"type":"text","value":"(","key":"aLQC2m3c5b"},{"type":"text","value":"8.4","key":"qe55Y3md21"},{"type":"text","value":")","key":"D0c0aqC2aT"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"RFM8PPd3Z1"},{"type":"text","value":".\nThen play continues.","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"key":"KYzP6bGx59"}],"key":"wbtY1Y2buT"},{"type":"paragraph","position":{"start":{"line":482,"column":1},"end":{"line":483,"column":1}},"children":[{"type":"text","value":"Between turns, we can keep the subtree whose statistics we have visited so far.\nHowever, the rest of the tree for the actions we did ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"X2NcMHFGxE"},{"type":"emphasis","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"TiGLJAwhty"}],"key":"BtSIYYL99N"},{"type":"text","value":" end up taking gets discarded.","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"V4I78GH3e5"}],"key":"NctEPWr98D"}],"enumerator":"8.2","html_id":"mcts-algorithm","key":"N0eDpw3XVf"},{"type":"paragraph","position":{"start":{"line":486,"column":1},"end":{"line":487,"column":1}},"children":[{"type":"text","value":"The application which brought the MCTS algorithm to fame was DeepMind’s ","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"zNw6wpVHk6"},{"type":"strong","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"children":[{"type":"text","value":"AlphaGo","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"cy45ezXrjQ"}],"key":"S1SDzS7Lv9"},{"type":"text","value":" ","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"hksqpprIpe"},{"type":"cite","kind":"narrative","label":"silver_mastering_2016","identifier":"silver_mastering_2016","children":[{"type":"text","value":"Silver 
","key":"pfVH4VZHM7"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"ZAOMwJhzdk"}],"key":"naHv5xOjqT"},{"type":"text","value":" (2016)","key":"TdGB1sBL9e"}],"enumerator":"1","key":"Ha6U0MIIEE"},{"type":"text","value":".\nSince then, it has been used in numerous applications ranging from games to automated theorem proving.","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"r01wiOeR7u"}],"key":"wUyusf8Jy8"},{"type":"paragraph","position":{"start":{"line":489,"column":1},"end":{"line":492,"column":1}},"children":[{"type":"text","value":"How accurate is this Monte Carlo estimation?\nIt depends heavily on the rollout policy ","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"FQB4MdDO0x"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"SQQiqsKexO"},{"type":"text","value":".\nIf the distribution ","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"NwXWmgQObU"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"SdnvhHvfSv"},{"type":"text","value":" induces over games is very different from the distribution seen during real gameplay,\nwe might end up with a poor value approximation.","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"V8SWP7QF4N"}],"key":"MJm8kEaxWZ"},{"type":"heading","depth":3,"position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"children":[{"type":"text","value":"Incorporating value functions and policies","position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"key":"hWSKFZdf7H"}],"identifier":"incorporating-value-functions-and-policies","label":"Incorporating value functions and policies","html_id":"incorporating-value-functions-and-policies","implicit":true,"enumerator":"8.5.1","key":"YKVkYT1GwR"},{"type":"paragraph","position":{"start":{"line":496,"column":1},"end":{"line":498,"column":1}},"children":[{"type":"text","value":"To remedy this,\nwe might make use of a value function ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"U1xJkH7ip5"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"HJcIzAzzY0"},{"type":"text","value":" that more efficiently approximates the value of a state.\nThen, we can replace the simulation step of ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"XcaIIo73L4"},{"type":"crossReference","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"children":[{"type":"text","value":"MCTS","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"hLuTDjpChe"}],"identifier":"mcts-algorithm","label":"mcts-algorithm","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.2","resolved":true,"html_id":"mcts-algorithm","key":"oYsj4bTTob"},{"type":"text","value":" with evaluating ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"l6vxSQG9Pu"},{"type":"inlineMath","value":"r = v(s_\\text{next})","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"r=v(snext)r = 
v(s_\\text{next})r=v(snext)","key":"y73w3deMcl"},{"type":"text","value":", where ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"HUJVK0UiDI"},{"type":"inlineMath","value":"s_\\text{next} = P(s_\\text{new}, a_\\text{new})","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"snext=P(snew,anew)s_\\text{next} = P(s_\\text{new}, a_\\text{new})snext=P(snew,anew)","key":"Okg73y9bze"},{"type":"text","value":".","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"PB4RY0kYU6"}],"key":"WB8iNbYGUh"},{"type":"paragraph","position":{"start":{"line":500,"column":1},"end":{"line":501,"column":1}},"children":[{"type":"text","value":"We might also make use of a ","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"yPbylCIXv7"},{"type":"strong","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"“guiding” policy","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"FSVga7S9GL"}],"key":"RIGY5h0R0j"},{"type":"text","value":" ","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"KrxOabBsoG"},{"type":"inlineMath","value":"\\pi_\\text{guide} : \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"html":"πguide:S(A)\\pi_\\text{guide} : \\mathcal{S} \\to \\triangle(\\mathcal{A})πguide:S(A)","key":"zD2PVxIzIL"},{"type":"text","value":" that provides “intuition” as to which actions are more valuable in a given state.\nWe can scale the exploration term of ","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"K0o5jRvgu5"},{"type":"crossReference","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"(","key":"wauaRVlwzU"},{"type":"text","value":"8.4","key":"ndCXERut6O"},{"type":"text","value":")","key":"ARYbCYgQAV"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"Wb6BcbJDdT"},{"type":"text","value":" according to the policy’s outputs.","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"oTI8YfXzSv"}],"key":"LJhjyuiyRE"},{"type":"paragraph","position":{"start":{"line":503,"column":1},"end":{"line":504,"column":1}},"children":[{"type":"text","value":"Putting these together,\nwe can describe an updated version of MCTS that makes use of these value functions and policy:","position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"key":"WGgq3SuQyV"}],"key":"EcH9kDeWir"},{"type":"proof","kind":"algorithm","label":"mcts-policy-value","identifier":"mcts-policy-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search with policy and value 
functions","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"ZKmpUr42AK"}],"key":"uaMsJGlBl8"},{"type":"paragraph","position":{"start":{"line":509,"column":1},"end":{"line":509,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":509,"column":1},"end":{"line":509,"column":1}},"key":"kFKCSGKkSE"}],"key":"x8EKAR5aef"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":510,"column":1},"end":{"line":514,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"html":"TTT","key":"hCdfdHoNn0"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"key":"UR4WQwYmZc"}],"key":"jA9bexQzJS"},{"type":"listItem","spread":true,"position":{"start":{"line":511,"column":1},"end":{"line":511,"column":1}},"children":[{"type":"inlineMath","value":"v","position":{"start":{"line":511,"column":1},"end":{"line":511,"column":1}},"html":"vvv","key":"G8fmaifaY6"},{"type":"text","value":", a value function that evaluates how good a state is","position":{"start":{"line":511,"column":1},"end":{"line":511,"column":1}},"key":"jWcSyic7u6"}],"key":"sEVgCR9nNB"},{"type":"listItem","spread":true,"position":{"start":{"line":512,"column":1},"end":{"line":512,"column":1}},"children":[{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":512,"column":1},"end":{"line":512,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"AFfjkf3cjp"},{"type":"text","value":", a guiding policy that encourages certain actions","position":{"start":{"line":512,"column":1},"end":{"line":512,"column":1}},"key":"xFxkdY5rBQ"}],"key":"mQ7ldmOEZc"},{"type":"listItem","spread":true,"position":{"start":{"line":513,"column":1},"end":{"line":514,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"html":"ccc","key":"rkkYJx0Tu6"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"UBZHR5oKft"}],"key":"u9Z64kEYHG"}],"key":"XvWICFvETA"},{"type":"paragraph","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"children":[{"type":"text","value":"To select a move in state ","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"key":"gTo3mgm9vD"},{"type":"inlineMath","value":"s_\\text{start}","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"html":"sstarts_\\text{start}sstart","key":"Kf9UzXDygY"},{"type":"text","value":", we repeat the following four steps ","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"key":"iHdMoKIGBU"},{"type":"inlineMath","value":"T","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"html":"TTT","key":"MH90S5NZ0P"},{"type":"text","value":" 
times:","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"key":"o2FgKBQ5H2"}],"key":"YnJ0SACc5r"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":517,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":517,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"strong","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"gZz3Z98KYk"}],"key":"h9mK7QTPnw"},{"type":"text","value":": We start at ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"ObPGSLQdxz"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"xRSQQNe0Cx"},{"type":"text","value":". Let ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"GhdZrg5uTm"},{"type":"text","value":"τ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"GAVUy3oTXr"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"XzLCXsZro5"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":518,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":518,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":518,"column":1},"end":{"line":518,"column":1}},"key":"jcmyLy692g"},{"type":"inlineMath","value":"s","position":{"start":{"line":518,"column":1},"end":{"line":518,"column":1}},"html":"sss","key":"UJVMtiEZno"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":518,"column":1},"end":{"line":518,"column":1}},"key":"bCDzsuiKpk"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":519,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":519,"column":1},"end":{"line":523,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"key":"c5XdlHooTA"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"cIaEjKyQWU"},{"type":"text","value":", where\n","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"key":"IspG3ZHtc6"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\cdot \\pi_\\text{guide}(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"identifier":"ucb-tree-policy","label":"ucb-tree-policy","html_id":"ucb-tree-policy","html":"UCBs,a=Ws,aNs+cπguide(as)lnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\cdot \\pi_\\text{guide}(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, 
a}}}UCBs,a=NsWs,a+cπguide(as)Ns,alnNs","enumerator":"8.5","key":"v3Tkz2TZMV"}],"key":"uoLHn3p6r7"},{"type":"listItem","spread":true,"position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"Append ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"zspRtxrW9y"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"vJeNsUV6ZZ"},{"type":"text","value":" to ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"oALyn113az"},{"type":"text","value":"τ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"rweaNG2emz"}],"key":"bjpWm5WecN"},{"type":"listItem","spread":true,"position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"DoVDFWqDjZ"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"html":"sP(s,a)s \\gets P(s, a)sP(s,a)","key":"yQG0ehWNhE"}],"key":"g9sOGe8emN"}],"key":"ZKdaeJASCt"}],"key":"L9fMnQ9Kn5"}],"key":"qKdPlUAdxh"}],"key":"xw666wBOoV"},{"type":"listItem","spread":true,"position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"children":[{"type":"strong","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"wirBJxeD1i"}],"key":"PlgOETq1Cf"},{"type":"text","value":": Let ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"MD9UvNiCU5"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"snews_\\text{new}snew","key":"lrrSvgywll"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"fKipzc8feI"},{"type":"text","value":"τ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"TcszPR3XUu"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"BG4H9akVFc"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"snews_\\text{new}snew","key":"tV8lPhSPK5"},{"type":"text","value":". Call it ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"V0438uWAOj"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"anewa_{\\text{new}}anew","key":"F11h3oid3P"},{"type":"text","value":". 
Add it to ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"Uqbhz1MatA"},{"type":"text","value":"τ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"M739mx6tlx"},{"type":"text","value":".","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"paUpCLLbQA"}],"key":"c5GXMA7e4e"},{"type":"listItem","spread":true,"position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"children":[{"type":"strong","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"yO1Ev16Bqc"}],"key":"DyjSmsOr5f"},{"type":"text","value":": Let ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"Az0ElW0EoV"},{"type":"inlineMath","value":"s_\\text{next} = P(s_\\text{new}, a_\\text{new})","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"snext=P(snew,anew)s_\\text{next} = P(s_\\text{new}, a_\\text{new})snext=P(snew,anew)","key":"bqoxyrwztE"},{"type":"text","value":". Evaluate ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"FS1qgfTWLK"},{"type":"inlineMath","value":"r = v(s_\\text{next})","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"r=v(snext)r = v(s_\\text{next})r=v(snext)","key":"BNuKAFAwgc"},{"type":"text","value":". This approximates the value of the game after taking the action ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"X9XETW2cPV"},{"type":"inlineMath","value":"a_\\text{new}","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"anewa_\\text{new}anew","key":"Ui676HHOpr"},{"type":"text","value":".","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"ANInzzTmdp"}],"key":"ZEipVCr0RI"},{"type":"listItem","spread":true,"position":{"start":{"line":528,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"strong","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"BVoKAWEuZL"}],"key":"ZlM4KnVcNG"},{"type":"text","value":": For each ","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"TeGLFDuN6F"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"Eju5AOR1bX"},{"type":"text","value":":","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"R3ry7oM83Z"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":529,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"children":[{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"yBx27BdnJ4"}],"key":"S1F6S8OHgC"},{"type":"listItem","spread":true,"position":{"start":{"line":530,"column":1},"end":{"line":530,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + r","position":{"start":{"line":530,"column":1},"end":{"line":530,"column":1}},"html":"Ws,aWs,a+rW^{s, a} 
\\gets W^{s, a} + rWs,aWs,a+r","key":"nAmZgxrepP"}],"key":"yTtTOmcMjL"},{"type":"listItem","spread":true,"position":{"start":{"line":531,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"Y43sbpifQA"}],"key":"ZtE2XbRmnw"}],"key":"TS5ZUGQxlr"}],"key":"hkKOUrjA4Y"}],"key":"A0ye08IZ2Q"},{"type":"paragraph","position":{"start":{"line":533,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"We finally return the action with the highest UCB value ","position":{"start":{"line":533,"column":1},"end":{"line":533,"column":1}},"key":"j0RtsNYB0L"},{"type":"crossReference","position":{"start":{"line":533,"column":1},"end":{"line":533,"column":1}},"children":[{"type":"text","value":"(","key":"UCfRIYiJoF"},{"type":"text","value":"8.5","key":"Y0ITQrGww5"},{"type":"text","value":")","key":"dxKuTLKLjn"}],"identifier":"ucb-tree-policy","label":"ucb-tree-policy","kind":"equation","template":"(%s)","enumerator":"8.5","resolved":true,"html_id":"ucb-tree-policy","key":"zN5iKUWvYS"},{"type":"text","value":".\nThen play continues. As before, we can reuse the tree across timesteps.","position":{"start":{"line":533,"column":1},"end":{"line":533,"column":1}},"key":"yIqFHl3X5J"}],"key":"CPlAEltDkb"}],"enumerator":"8.3","html_id":"mcts-policy-value","key":"Jr33vZOmtQ"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"How do we actually compute a useful ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"J01BILRqLA"},{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"Vlc9Dl018X"},{"type":"text","value":" and ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"onhsJEJdO5"},{"type":"inlineMath","value":"v","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"vvv","key":"pUTCA1fhph"},{"type":"text","value":"?\nIf we have some existing dataset of trajectories,\nwe could use ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"WyYwLqsvq8"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"gXnwvu8gVA"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"onBcmL4AWS"},{"type":"text","value":" (that is, imitation learning)\nto generate a policy ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"d9BhSGhveT"},{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"uXG3FEr9qe"},{"type":"text","value":" via behavioral cloning\nand learn ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"JpPow6oQXg"},{"type":"inlineMath","value":"v","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"vvv","key":"FKvomL8u1T"},{"type":"text","value":" by regressing the game outcomes onto states.\nThen, plugging these into 
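(A minimal Python sketch of one iteration of the MCTS procedure above, under the text's assumption of deterministic dynamics P. The names actions, π_guide, v, and the statistics dictionaries N, W, N_s are hypothetical stand-ins for the quantities named in the algorithm; terminal-state handling is omitted.)

import math

def ucb(s, a, N, W, N_s, π_guide, c):
    # UCB^{s,a} = W^{s,a}/N^s + c · π_guide(a|s) · sqrt(ln N^s / N^{s,a}), as in (8.5)
    return W[s, a] / N_s[s] + c * π_guide(a, s) * math.sqrt(math.log(N_s[s]) / N[s, a])

def mcts_iteration(s, P, actions, N, W, N_s, π_guide, v, c):
    τ = []
    # Selection: descend while every action at s has already been taken
    while all(N.get((s, a), 0) > 0 for a in actions(s)):
        a = max(actions(s), key=lambda a: ucb(s, a, N, W, N_s, π_guide, c))
        τ.append((s, a))
        s = P(s, a)
    # Expansion: take one previously unexplored action from the final state
    a_new = next(a for a in actions(s) if N.get((s, a), 0) == 0)
    τ.append((s, a_new))
    # Simulation: approximate the value of the resulting state
    r = v(P(s, a_new))
    # Backup: update the visit counts and total rewards along the path
    for s_i, a_i in τ:
        N[s_i, a_i] = N.get((s_i, a_i), 0) + 1
        W[s_i, a_i] = W.get((s_i, a_i), 0.0) + r
        N_s[s_i] = N_s.get(s_i, 0) + 1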
","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"zR3DhjpB91"},{"type":"crossReference","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"the above algorithm","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"HEzDuXgBGC"}],"identifier":"mcts-policy-value","label":"mcts-policy-value","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.3","resolved":true,"html_id":"mcts-policy-value","key":"mCY8k2AN1F"},{"type":"text","value":"\nresults in a stronger policy by using tree search to “think ahead”.","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"UKW736mLnH"}],"key":"eEKkvhurbu"},{"type":"paragraph","position":{"start":{"line":545,"column":1},"end":{"line":546,"column":1}},"children":[{"type":"text","value":"But we don’t have to stop at just one improvement step;\nwe could iterate this process via ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"MBIEYET2Tg"},{"type":"strong","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"children":[{"type":"text","value":"self-play","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"dOBZy4VWks"}],"key":"AzrlwR8WJP"},{"type":"text","value":".","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"sSiVv3unmq"}],"key":"xpkZI2osZx"},{"type":"heading","depth":3,"position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"children":[{"type":"text","value":"Self-play","position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"key":"cEKnQoBMJH"}],"identifier":"self-play","label":"Self-play","html_id":"self-play","implicit":true,"enumerator":"8.5.2","key":"svHQMoADmw"},{"type":"paragraph","position":{"start":{"line":550,"column":1},"end":{"line":560,"column":1}},"children":[{"type":"text","value":"Recall the ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"ej9iHm9sBo"},{"type":"crossReference","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"temliDoAGv"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"Me55OFMB5t"},{"type":"text","value":" algorithm from the ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"iTWKD8Wc6u"},{"type":"link","url":"/mdps","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"MDPs","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"cycS6fT2h4"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"xwIdU3EQQ8"},{"type":"text","value":" chapter.\nPolicy iteration alternates between ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"ZPMudJjVf3"},{"type":"strong","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"eFAKPCtScv"}],"key":"ZzYhWf9KVf"},{"type":"text","value":" (taking 
","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"U2oL1iLExM"},{"type":"text","value":"π","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"nHEyckiiuv"},{"type":"text","value":" and computing ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"NoHR37CPQx"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"VπV^\\piVπ","key":"zK97l0p618"},{"type":"text","value":")\nand ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"sCfI8kC3NH"},{"type":"strong","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"policy improvement","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"I0ouASvtlB"}],"key":"jogRvxFsjf"},{"type":"text","value":" (setting ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"dV9BdLHwuL"},{"type":"text","value":"π","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"Vw0NAnBIXz"},{"type":"text","value":" to be greedy with respect to ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"zQO8D1hbNn"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"VπV^\\piVπ","key":"yLZoS0kIvt"},{"type":"text","value":").\nAbove, we saw how MCTS can be thought of as a “policy improvement” operation:\nfor a given policy ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"LUe6bKC7zy"},{"type":"inlineMath","value":"\\pi^0","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"π0\\pi^0π0","key":"BoFJN2GC0G"},{"type":"text","value":",\nwe can use it to guide MCTS,\nresulting in an algorithm that is itself a policy ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"ZMUPGW7Xzb"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"SXhzcuwQG1"},{"type":"text","value":" that maps from states to actions.\nNow, we can use ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"hvvFcw4gcw"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"EsxZ4EZUGs"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"LE2E1unlJK"},{"type":"text","value":"\nto obtain a new policy ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"dVYjdU71QR"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"π1\\pi^1π1","key":"CDTY4FY0fR"},{"type":"text","value":" that imitates ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"cSYZmIXzV1"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"wELvyU38zW"},{"type":"text","value":".\nWe can now use 
","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"lEmUelSRzH"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"π1\\pi^1π1","key":"aWfM3TQny2"},{"type":"text","value":" to guide MCTS,\nand repeat.","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"XOy6bTUlbp"}],"key":"BnuqnXAMoB"},{"type":"proof","kind":"algorithm","label":"mcts-self-play","identifier":"mcts-self-play","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"MCTS with self-play","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"MUAukC9X4Q"}],"key":"RTRoBS2RWa"},{"type":"paragraph","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"nno8UhdiCh"}],"key":"AjDa1t2ItM"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":567,"column":1},"end":{"line":571,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"children":[{"type":"text","value":"A parameterized policy class ","position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"key":"Cr1yMgYlbv"},{"type":"inlineMath","value":"\\pi_\\theta : \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"html":"πθ:S(A)\\pi_\\theta : \\mathcal{S} \\to \\triangle(\\mathcal{A})πθ:S(A)","key":"fQqVSu8CjP"}],"key":"mv08EMrVMr"},{"type":"listItem","spread":true,"position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"children":[{"type":"text","value":"A parameterized value function class ","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"key":"G9oMpG89OF"},{"type":"inlineMath","value":"v_\\lambda : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"html":"vλ:SRv_\\lambda : \\mathcal{S} \\to \\mathbb{R}vλ:SR","key":"HC1weQ2fQb"}],"key":"PfV0MiRJ4h"},{"type":"listItem","spread":true,"position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"children":[{"type":"text","value":"A number of trajectories ","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"ttZnRpy68x"},{"type":"inlineMath","value":"M","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"html":"MMM","key":"JeFBH2IEDv"},{"type":"text","value":" to generate","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"av6aLnkwRS"}],"key":"x54EPykCvn"},{"type":"listItem","spread":true,"position":{"start":{"line":570,"column":1},"end":{"line":571,"column":1}},"children":[{"type":"text","value":"The initial parameters ","position":{"start":{"line":570,"column":1},"end":{"line":570,"column":1}},"key":"TkZgfT3Rpf"},{"type":"inlineMath","value":"\\theta^0, \\lambda^0","position":{"start":{"line":570,"column":1},"end":{"line":570,"column":1}},"html":"θ0,λ0\\theta^0, \\lambda^0θ0,λ0","key":"gcHkaWzVLd"}],"key":"r8YRA1NzCo"}],"key":"t3jXGgMeCH"},{"type":"paragraph","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"U8TfxVXeRJ"},{"type":"inlineMath","value":"t = 0, \\dots, 
T-1","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"html":"t=0,,T1t = 0, \\dots, T-1t=0,,T1","key":"Y0B1LvWIjC"},{"type":"text","value":":","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"oB1xRqofIA"}],"key":"mo7gDVle7c"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":574,"column":1},"end":{"line":580,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"strong","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"text","value":"Policy improvement","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"uwPKukFE55"}],"key":"srAGk0QZyG"},{"type":"text","value":": Let ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"zkSDGv82wc"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"DwCiM7NPHC"},{"type":"text","value":" denote the policy obtained by ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"A82FkqIDuv"},{"type":"crossReference","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"text","value":"Algorithm ","key":"ZPAZU9azIi"},{"type":"text","value":"8.3","key":"QdRqX0d4Tk"}],"identifier":"mcts-policy-value","label":"mcts-policy-value","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.3","resolved":true,"html_id":"mcts-policy-value","key":"TGhm0sLryg"},{"type":"text","value":" with ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"IEUhwt0998"},{"type":"inlineMath","value":"\\pi_{\\theta^t}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"πθt\\pi_{\\theta^t}πθt","key":"MFdCaNZ2VX"},{"type":"text","value":" and ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"dVI6o4kdgX"},{"type":"inlineMath","value":"v_{\\lambda^t}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"vλtv_{\\lambda^t}vλt","key":"fzIMmsGCVN"},{"type":"text","value":". We use ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"dFLXKDLn82"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"grX7RLIL2a"},{"type":"text","value":" to play against itself ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"azIHeRjgvH"},{"type":"inlineMath","value":"M","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"MMM","key":"i7miUF36EF"},{"type":"text","value":" times. 
This generates ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"NB3GLN2GIB"},{"type":"inlineMath","value":"M","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"MMM","key":"LsgKTyIrVJ"},{"type":"text","value":" trajectories ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"RDK7k5Uqfz"},{"type":"inlineMath","value":"\\tau_0, \\dots, \\tau_{M-1}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"τ0,,τM1\\tau_0, \\dots, \\tau_{M-1}τ0,,τM1","key":"X6de0kt4MO"},{"type":"text","value":".","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"aIP5fmAtOA"}],"key":"BhCXJtqUQr"},{"type":"listItem","spread":true,"position":{"start":{"line":575,"column":1},"end":{"line":580,"column":1}},"children":[{"type":"strong","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"children":[{"type":"text","value":"Policy evaluation","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"bH0HOSQaR0"}],"key":"wy7H995at5"},{"type":"text","value":": Use behavioral cloning to find a set of policy parameters ","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"VhewlV3JwW"},{"type":"inlineMath","value":"\\theta^{t+1}","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"html":"θt+1\\theta^{t+1}θt+1","key":"vBfb6qUcG8"},{"type":"text","value":" that mimic the behavior of ","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"VvfjsTx7Hh"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"dWovUIb2hQ"},{"type":"text","value":" and a set of value function parameters ","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"Gkpf4Tl83K"},{"type":"inlineMath","value":"\\lambda^{t+1}","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"html":"λt+1\\lambda^{t+1}λt+1","key":"VPUpoorFY3"},{"type":"text","value":" that approximate its value function. 
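(The two fitting objectives stated next translate directly into code; a sketch, assuming JAX and hypothetical parameterized functions π(θ, a, s) and v(λ, s), with each trajectory given as a list of (s, a) pairs and returns[m] = R(τ_m).)

import jax.numpy as jnp

def behavioral_cloning_loss(π, θ, trajectories):
    # Negative log-likelihood of the MCTS actions under π_θ
    return sum(-jnp.log(π(θ, a, s)) for τ in trajectories for s, a in τ)

def value_loss(v, λ, trajectories, returns):
    # Squared error between v_λ(s) and the observed return of the trajectory
    return sum((v(λ, s) - R) ** 2 for τ, R in zip(trajectories, returns) for s, a in τ)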
That is,","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"NplBq6cMEx"},{"type":"math","tight":"before","value":"\\begin{align*}\n \\theta^{t+1} &\\gets \\argmin_\\theta \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} - \\log \\pi_\\theta(a^m_\\hi \\mid s^m_\\hi) \\\\\n \\lambda^{t+1} &\\gets \\argmin_\\lambda \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} (v_\\lambda(s^m_\\hi) - R(\\tau_m))^2\n \\end{align*}","position":{"start":{"line":576,"column":1},"end":{"line":578,"column":1}},"html":"θt+1arg minθm=0M1h=0H1logπθ(ahmshm)λt+1arg minλm=0M1h=0H1(vλ(shm)R(τm))2\\begin{align*}\n \\theta^{t+1} &\\gets \\argmin_\\theta \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} - \\log \\pi_\\theta(a^m_\\hi \\mid s^m_\\hi) \\\\\n \\lambda^{t+1} &\\gets \\argmin_\\lambda \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} (v_\\lambda(s^m_\\hi) - R(\\tau_m))^2\n \\end{align*}θt+1λt+1θargminm=0M1h=0H1logπθ(ahmshm)λargminm=0M1h=0H1(vλ(shm)R(τm))2","enumerator":"8.6","key":"ks1T07oR2m"}],"key":"B8AdADI31M"}],"key":"rTKaQ4aFsT"},{"type":"paragraph","position":{"start":{"line":581,"column":1},"end":{"line":584,"column":1}},"children":[{"type":"text","value":"Note that in implementation,\nthe policy and value are typically both returned by a single deep neural network,\nthat is, with a single set of parameters,\nand the two loss functions are added together.","position":{"start":{"line":581,"column":1},"end":{"line":581,"column":1}},"key":"msSzeO93S9"}],"key":"aQwe599tTF"}],"enumerator":"8.4","html_id":"mcts-self-play","key":"mHfQoaZP3z"},{"type":"paragraph","position":{"start":{"line":587,"column":1},"end":{"line":587,"column":1}},"children":[{"type":"text","value":"This algorithm was brought to fame by AlphaGo Zero ","position":{"start":{"line":587,"column":1},"end":{"line":587,"column":1}},"key":"SQBMALrhKE"},{"type":"cite","kind":"narrative","label":"silver_mastering_2017","identifier":"silver_mastering_2017","children":[{"type":"text","value":"Silver ","key":"TnSXFUsHpz"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"RelYGHTDbV"}],"key":"WpGgLPksyx"},{"type":"text","value":" (2017)","key":"GbfmEKOdBB"}],"enumerator":"2","key":"uk9fxVA6yJ"},{"type":"text","value":".","position":{"start":{"line":587,"column":1},"end":{"line":587,"column":1}},"key":"kc9PBnuzJ5"}],"key":"hYXX5yy6SY"},{"type":"heading","depth":2,"position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"iRkG9rMCY2"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"8.6","key":"cxC9ZHTdpP"},{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":598,"column":1}},"children":[{"type":"text","value":"In this chapter,\nwe explored tree search-based algorithms for deterministic, zero sum, fully observable two-player games.\nWe began with ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"RJeS3klJBa"},{"type":"crossReference","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"min-max search","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"zPhFP7jsxh"}],"identifier":"min-max-search","label":"min-max-search","kind":"heading","template":"Section %s","enumerator":"8.3","resolved":true,"html_id":"min-max-search","key":"JyKSAmx81t"},{"type":"text","value":",\nan algorithm for exactly solving the game value of 
every possible state.\nHowever, this is impossible to execute in practice,\nand so we must resort to various ways to reduce the number of states and actions that we must explore.\n","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"M2h2crDeTf"},{"type":"crossReference","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Alpha-beta search","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"ttyznV8CrH"}],"identifier":"alpha-beta-search","label":"alpha-beta-search","kind":"heading","template":"Section %s","enumerator":"8.4","resolved":true,"html_id":"alpha-beta-search","key":"MFYA2YzgRp"},{"type":"text","value":" does this by ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"kkTXlev6oM"},{"type":"emphasis","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"pruning","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"wqSD0zp3eu"}],"key":"fLDv0SCc0B"},{"type":"text","value":" away states that we already know to be suboptimal,\nand ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"uewAUbnJGQ"},{"type":"crossReference","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"kvmDqMNMqT"}],"identifier":"monte-carlo-tree-search","label":"monte-carlo-tree-search","kind":"heading","template":"Section %s","enumerator":"8.5","resolved":true,"html_id":"monte-carlo-tree-search","key":"GBTL7rnVmW"},{"type":"text","value":" ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"hXDkfz1Suj"},{"type":"emphasis","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"approximates","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"JhphtQKiAC"}],"key":"JijyY9szTK"},{"type":"text","value":" the value of states instead of evaluating them exactly.","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"JHOhHySA2I"}],"key":"WMEpeRCtql"},{"type":"heading","depth":2,"position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"children":[{"type":"text","value":"References","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"eyXZR5i7C5"}],"identifier":"references","label":"References","html_id":"references","implicit":true,"enumerator":"8.7","key":"cd565GADLB"},{"type":"paragraph","position":{"start":{"line":603,"column":1},"end":{"line":611,"column":1}},"children":[{"type":"text","value":"Chapter 5 of ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"JOdi0dL7Z1"},{"type":"cite","kind":"narrative","label":"russell_artificial_2021","identifier":"russell_artificial_2021","children":[{"type":"text","value":"Russell & Norvig (2021)","key":"QqdI9UQpln"}],"enumerator":"3","key":"KhO9rVRneN"},{"type":"text","value":" provides an excellent overview of search methods in games.\nThe original AlphaGo paper ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"lnkToUpv9f"},{"type":"cite","kind":"narrative","label":"silver_mastering_2016","identifier":"silver_mastering_2016","children":[{"type":"text","value":"Silver 
","key":"s6ylqlPXUQ"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"YV1Rh9avTW"}],"key":"fa9IYWavPn"},{"type":"text","value":" (2016)","key":"hHr7tYln1S"}],"enumerator":"1","key":"xWns1upxMt"},{"type":"text","value":" was a groundbreaking application of these technologies.\n","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"b4EP2372qn"},{"type":"cite","kind":"narrative","label":"silver_mastering_2017","identifier":"silver_mastering_2017","children":[{"type":"text","value":"Silver ","key":"Yhjtycn6HM"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"YKu8sxhPN3"}],"key":"YbBQkGvCVO"},{"type":"text","value":" (2017)","key":"leCyxm6yTm"}],"enumerator":"2","key":"Kz6MJosUsf"},{"type":"text","value":" removed the imitation learning phase,\nlearning from scratch.\nAlphaZero ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"KUdRAuN8fC"},{"type":"cite","kind":"narrative","label":"silver_general_2018","identifier":"silver_general_2018","children":[{"type":"text","value":"Silver ","key":"d3uG58KdmC"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"tViykXVPCG"}],"key":"texB8hbaec"},{"type":"text","value":" (2018)","key":"I8SdQCQ0fv"}],"enumerator":"4","key":"KYAA2Ip2QT"},{"type":"text","value":" then extended to other games beyond Go,\nnamely shogi and chess,\nalso learning from scratch.\nIn MuZero ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"PnJxt0YUrp"},{"type":"cite","kind":"narrative","label":"schrittwieser_mastering_2020","identifier":"schrittwieser_mastering_2020","children":[{"type":"text","value":"Schrittwieser ","key":"ZeIvEkzXRu"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"zYTOvoWD4y"}],"key":"FvdadOSeqO"},{"type":"text","value":" (2020)","key":"TSFSIImeHh"}],"enumerator":"5","key":"kriGRNbRPk"},{"type":"text","value":",\nthis was further extended by learning a model of the game dynamics.","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"o4yL9RSzpI"}],"key":"iqoLfZkaLA"}],"key":"ZSlU4QZdZa"}],"key":"FqxwqZe177"},"references":{"cite":{"order":["silver_mastering_2016","silver_mastering_2017","russell_artificial_2021","silver_general_2018","schrittwieser_mastering_2020"],"data":{"silver_mastering_2016":{"label":"silver_mastering_2016","enumerator":"1","doi":"10.1038/nature16961","html":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., van den Driessche, G., Schrittwieser, J., Antonoglou, I., Panneershelvam, V., Lanctot, M., Dieleman, S., Grewe, D., Nham, J., Kalchbrenner, N., Sutskever, I., Lillicrap, T., Leach, M., Kavukcuoglu, K., Graepel, T., & Hassabis, D. (2016). Mastering the Game of Go with Deep Neural Networks and Tree Search. Nature, 529(7587), 484–489. 10.1038/nature16961","url":"https://doi.org/10.1038/nature16961"},"silver_mastering_2017":{"label":"silver_mastering_2017","enumerator":"2","doi":"10.1038/nature24270","html":"Silver, D., Schrittwieser, J., Simonyan, K., Antonoglou, I., Huang, A., Guez, A., Hubert, T., Baker, L., Lai, M., Bolton, A., Chen, Y., Lillicrap, T., Hui, F., Sifre, L., van den Driessche, G., Graepel, T., & Hassabis, D. (2017). Mastering the Game of Go without Human Knowledge. Nature, 550(7676), 354–359. 10.1038/nature24270","url":"https://doi.org/10.1038/nature24270"},"russell_artificial_2021":{"label":"russell_artificial_2021","enumerator":"3","html":"Russell, S. J., & Norvig, P. (2021). 
Artificial Intelligence: A Modern Approach (Fourth edition). Pearson."},"silver_general_2018":{"label":"silver_general_2018","enumerator":"4","doi":"10.1126/science.aar6404","html":"Silver, D., Hubert, T., Schrittwieser, J., Antonoglou, I., Lai, M., Guez, A., Lanctot, M., Sifre, L., Kumaran, D., Graepel, T., Lillicrap, T., Simonyan, K., & Hassabis, D. (2018). A General Reinforcement Learning Algorithm That Masters Chess, Shogi, and Go through Self-Play. Science, 362(6419), 1140–1144. 10.1126/science.aar6404","url":"https://doi.org/10.1126/science.aar6404"},"schrittwieser_mastering_2020":{"label":"schrittwieser_mastering_2020","enumerator":"5","doi":"10.1038/s41586-020-03051-4","html":"Schrittwieser, J., Antonoglou, I., Hubert, T., Simonyan, K., Sifre, L., Schmitt, S., Guez, A., Lockhart, E., Hassabis, D., Graepel, T., Lillicrap, T., & Silver, D. (2020). Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model. Nature, 588(7839), 604–609. 10.1038/s41586-020-03051-4","url":"https://doi.org/10.1038/s41586-020-03051-4"}}}},"footer":{"navigation":{"prev":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"9 Exploration in MDPs","url":"/exploration","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/supervised-learning.html b/supervised-learning.html index 8761c7a..f71ebbd 100644 --- a/supervised-learning.html +++ b/supervised-learning.html @@ -14,17 +14,17 @@ ); root.querySelectorAll(".hide-mac").forEach(node => {node.classList.add(isMac ? "hidden" : "block")}); root.querySelectorAll(".show-mac").forEach(node => {node.classList.add(!isMac ? "hidden" : "block")}); -})()

    4 Supervised learning

    4.1 Introduction

    This section will cover the details of implementing the fit function above: That is, how to use a dataset of labelled samples (x1,y1),,(xN,yN)(x_1, y_1), \dots, (x_N, y_N) to find a function ff that minimizes the empirical risk. This requires two ingredients:

    1. A function class F\mathcal{F} to search over
    2. A fitting method for minimizing the empirical risk over this class

    The two main function classes we will cover are linear models and neural networks. Both of these function classes are parameterized by some parameters θ, and the fitting method will search over these parameters to minimize the empirical risk:

    The most common fitting method for parameterized models is gradient descent.

    from jaxtyping import Float, Array
    -from collections.abc import Callable
    Params = Float[Array, " D"]
    +we want to find a parameter (vector) θ^\hat \theta that minimizes the empirical risk:

    θ^=argminθ1Ni=1N(yifθ(xi))2\hat \theta = \arg\min_{\theta} \frac{1}{N} \sum_{i=1}^N (y_i - f_\theta(x_i))^2
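    For concreteness, this objective is easy to write as code; a minimal sketch, assuming JAX and a hypothetical parameterized model f(θ, x):

    import jax.numpy as jnp
    from jax import vmap


    def empirical_risk(f, X, y):
        """Return the map θ ↦ (1/N) Σᵢ (yᵢ − f(θ, xᵢ))², ready to hand to a fitting method."""
        return lambda θ: jnp.mean((y - vmap(lambda x: f(θ, x))(X)) ** 2)


    # Hypothetical usage with a linear model f_θ(x) = θᵀx:
    f = lambda θ, x: θ @ x
    loss = empirical_risk(f, X=jnp.ones((10, 3)), y=jnp.zeros(10))
    loss(jnp.zeros(3))  # evaluates the empirical risk at θ = 0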

    The most common fitting method for parameterized models is gradient descent.

    from jaxtyping import Float, Array
    +from collections.abc import Callable
    Params = Float[Array, " D"]
     
     
     def gradient_descent(
    @@ -40,26 +40,26 @@
         θ = θ_init
         for _ in range(epochs):
             θ = θ - η * grad(loss)(θ)
    -    return θ
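    A runnable version of this snippet needs grad in scope; a minimal self-contained variant, assuming JAX as the autodiff backend (an assumption, though the jaxtyping annotations and the use of vmap and grad elsewhere in the chapter suggest it), might look like:

    import jax.numpy as jnp
    from jax import grad
    from jaxtyping import Array, Float

    Params = Float[Array, " D"]


    def gradient_descent(loss, θ_init: Params, η: float, epochs: int) -> Params:
        """Minimize the given loss by repeatedly stepping along the negative gradient."""
        θ = θ_init
        for _ in range(epochs):
            θ = θ - η * grad(loss)(θ)  # grad(loss) computes the gradient function ∇L(θ)
        return θ


    # Hypothetical usage: a quadratic bowl whose minimum is at θ = (1, -2).
    loss = lambda θ: jnp.sum((θ - jnp.array([1.0, -2.0])) ** 2)
    θ_hat = gradient_descent(loss, θ_init=jnp.zeros(2), η=0.1, epochs=100)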

    4.2 Linear regression

    In linear regression, we assume that the function ff is linear in the parameters:

    F={xθxθRD}\mathcal{F} = \{ x \mapsto \theta^\top x \mid \theta \in \mathbb{R}^D \}

    This function class is extremely simple and only contains linear functions.
    +    return θ

    4.2 Linear regression

    In linear regression, we assume that the function ff is linear in the parameters:

    F={xθxθRD}\mathcal{F} = \{ x \mapsto \theta^\top x \mid \theta \in \mathbb{R}^D \}

    This function class is extremely simple and only contains linear functions. To expand its expressivity, we can transform the input xx using some feature function ϕ, i.e. x~=ϕ(x)\widetilde x = \phi(x), and then fit a linear model in the transformed space instead.

    def fit_linear(X: Float[Array, "N D"], y: Float[Array, " N"], φ=lambda x: x):
    +x~=ϕ(x), and then fit a linear model in the transformed space instead.

    def fit_linear(X: Float[Array, "N D"], y: Float[Array, " N"], φ=lambda x: x):
         """Fit a linear model to the given dataset using ordinary least squares."""
         X = vmap(φ)(X)
         θ = np.linalg.lstsq(X, y, rcond=None)[0]
    -    return lambda x: np.dot(φ(x), θ)
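    A usage sketch for fit_linear, filling in the imports the snippet leaves implicit (we assume its np is jax.numpy and that vmap comes from JAX) and a hypothetical quadratic feature map:

    import jax.numpy as np  # the snippet's np is assumed to be jax.numpy
    from jax import vmap


    def fit_linear(X, y, φ=lambda x: x):
        """Fit a linear model to the given dataset using ordinary least squares."""
        X = vmap(φ)(X)
        θ = np.linalg.lstsq(X, y, rcond=None)[0]
        return lambda x: np.dot(φ(x), θ)


    # Recover y = x² from 1-D inputs by lifting x to the features (1, x, x²).
    φ = lambda x: np.array([1.0, x[0], x[0] ** 2])
    X = np.linspace(-1.0, 1.0, 20).reshape(-1, 1)
    f = fit_linear(X, X[:, 0] ** 2, φ)
    f(np.array([0.5]))  # ≈ 0.25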

    4.3 Neural networks

    In neural networks, we assume that the function ff is a composition of linear functions (represented by matrices WiW_i) and non-linear activation functions (denoted by σ):

    F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}\mathcal{F} = \{ x \mapsto \sigma(W_L \sigma(W_{L-1} \dots \sigma(W_1 x + b_1) \dots + b_{L-1}) + b_L) \}

    where WiRDi+1×DiW_i \in \mathbb{R}^{D_{i+1} \times D_i} and biRDi+1b_i \in \mathbb{R}^{D_{i+1}} are the parameters of the ii-th layer, and σ is the activation function.

    This function class is much more expressive and contains many more parameters.
    +    return lambda x: np.dot(φ(x), θ)

    4.3 Neural networks

    In neural networks, we assume that the function ff is a composition of linear functions (represented by matrices WiW_i) and non-linear activation functions (denoted by σ):

    F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}\mathcal{F} = \{ x \mapsto \sigma(W_L \sigma(W_{L-1} \dots \sigma(W_1 x + b_1) \dots + b_{L-1}) + b_L) \}

    where WiRDi+1×DiW_i \in \mathbb{R}^{D_{i+1} \times D_i} and biRDi+1b_i \in \mathbb{R}^{D_{i+1}} are the parameters of the ii-th layer, and σ is the activation function.
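    A minimal sketch of this function class as code, assuming JAX and choosing σ = ReLU (the text leaves the activation unspecified):

    import jax.numpy as jnp
    from jax import random


    def mlp(params, x):
        """Evaluate x ↦ σ(W_L σ(⋯ σ(W_1 x + b_1) ⋯) + b_L) for params = [(W_1, b_1), ..., (W_L, b_L)]."""
        for W, b in params:
            x = jnp.maximum(W @ x + b, 0.0)  # σ = ReLU, applied after every layer as in (4.4)
        return x


    # Hypothetical two-layer network with dimensions D_1 = 3 → D_2 = 4 → D_3 = 1.
    k1, k2 = random.split(random.PRNGKey(0))
    params = [
        (random.normal(k1, (4, 3)), jnp.zeros(4)),
        (random.normal(k2, (1, 4)), jnp.zeros(1)),
    ]
    mlp(params, jnp.ones(3))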

    This function class is much more expressive and contains many more parameters. This makes it more susceptible to overfitting on smaller datasets, but also allows it to represent more complex functions. In practice, however, neural networks exhibit interesting phenomena during training, and are often able to generalize well even with many parameters.

    Another reason for their popularity is the efficient backpropagation algorithm for computing the gradient of the empirical risk with respect to the parameters. Essentially, the hierarchical structure of the neural network, i.e. computing the output of the network as a composition of functions, -allows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.
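    To see this concretely, an autodiff library applies exactly this chain-rule computation for us; a sketch, assuming JAX:

    import jax.numpy as jnp
    from jax import grad, vmap


    def predict(params, x):
        (W1, b1), (W2, b2) = params
        return jnp.tanh(W2 @ jnp.tanh(W1 @ x + b1) + b2)  # a two-layer network with σ = tanh


    def empirical_risk(params, X, y):
        preds = vmap(lambda x: predict(params, x))(X)
        return jnp.mean((preds.squeeze() - y) ** 2)


    params = [
        (jnp.full((4, 3), 0.1), jnp.zeros(4)),
        (jnp.full((1, 4), 0.1), jnp.zeros(1)),
    ]
    # grad differentiates through the composition of layers (this is backpropagation),
    # returning one (∂L/∂Wᵢ, ∂L/∂bᵢ) pair per layer, mirroring the structure of params.
    grads = grad(empirical_risk)(params, jnp.ones((8, 3)), jnp.zeros(8))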

    Nielsen (2015) provides a comprehensive introduction to neural networks and backpropagation.

    References
    1. Nielsen, M. A. (2015). Neural Networks and Deep Learning. Determination Press.
    \ No newline at end of file diff --git a/supervised-learning.json b/supervised-learning.json index 5ab7cd4..3d3bc92 100644 --- a/supervised-learning.json +++ b/supervised-learning.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"e56ff69c011ee78674304db47cc4e85c51d95181fd2f1ca46bac12965fb5e8ee","slug":"supervised-learning","location":"/supervised_learning.md","dependencies":[],"frontmatter":{"title":"4 Supervised learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"4.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"supervised_learning.md","url":"/build/supervised_learning-350bcacee6e0c7c9985fcefbbc20f999.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"cnD4oxe07m"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"4.1","key":"o69OPJ23UU"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"This section will cover the details of implementing the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"yHdT1zSspE"},{"type":"inlineCode","value":"fit","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"cr9sAlwiRt"},{"type":"text","value":" function above:\nThat is, how to use a dataset of labelled samples ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ox9xkXo2Ab"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"pnZgX3EYQl"},{"type":"text","value":" to find a function ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"GtsrCDJsOF"},{"type":"inlineMath","value":"f","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"fff","key":"EiMX4TDFuG"},{"type":"text","value":" that minimizes the empirical risk.\nThis requires two ingredients:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"qjHYtXya81"}],"key":"GqSBZPGmhX"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":24,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"A 
","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"XgUrNZzCK6"},{"type":"strong","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"function class","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"lFBCLzML80"}],"key":"exvXWRc5aJ"},{"type":"text","value":" ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"E5QFkpXJ5d"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"html":"F\\mathcal{F}F","key":"ufyJIB17XW"},{"type":"text","value":" to search over","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"gDjy74NOOQ"}],"key":"W3yJefKnYk"},{"type":"listItem","spread":true,"position":{"start":{"line":25,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"pJTIbbVMaO"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"cya4Mv4g44"}],"key":"vX5B3M3C9d"},{"type":"text","value":" for minimizing the empirical risk over this class","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"z2QN3OMInT"}],"key":"aqclVJmvYg"}],"key":"ajdriPPfeK"},{"type":"paragraph","position":{"start":{"line":27,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"The two main function classes we will cover are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"Ot9zZrC1sh"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"linear models","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"fMZhe1MvJ9"}],"key":"LDcXl628jo"},{"type":"text","value":" and ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"aQ7WI8TbC5"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"neural networks","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"U1FfP60FTJ"}],"key":"unyxCMCJgj"},{"type":"text","value":".\nBoth of these function classes are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"WLxdNBho8T"},{"type":"emphasis","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"parameterized","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"gprbWDHxAg"}],"key":"SWJARMQl5v"},{"type":"text","value":" by some parameters ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"uACY9PXuo3"},{"type":"text","value":"θ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"QuHJnkrpmb"},{"type":"text","value":",\nand the fitting method will search over these parameters to minimize the empirical 
risk:","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"VroZVbHZ0y"}],"key":"zJMUOcL5b5"},{"type":"proof","kind":"definition","label":"parameterized_empirical_risk_minimization","identifier":"parameterized_empirical_risk_minimization","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Parameterized empirical risk minimization","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"CYuO3pPfck"}],"key":"wP1GxVUwH6"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Given a dataset of samples ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"mlqBBhJdhL"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"QGc32JF4wb"},{"type":"text","value":" and a class of functions ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"qMtZ0qiUKH"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"F\\mathcal{F}F","key":"wssnDuJ0mu"},{"type":"text","value":" parameterized by ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"zkcI5fhZa3"},{"type":"text","value":"θ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"FSAoUlHVwn"},{"type":"text","value":",\nwe to find a parameter (vector) ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"f4ifvGef64"},{"type":"inlineMath","value":"\\hat \\theta","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"θ^\\hat \\thetaθ^","key":"WYTjQPaIUc"},{"type":"text","value":" that minimizes the empirical risk:","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"CmuUsiUnbe"}],"key":"PjTTO1eM3p"},{"type":"math","value":"\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2","position":{"start":{"line":37,"column":1},"end":{"line":39,"column":1}},"html":"θ^=argminθ1Ni=1N(yifθ(xi))2\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2θ^=argθminN1i=1N(yifθ(xi))2","enumerator":"4.1","key":"YPcuLMuKWd"}],"enumerator":"4.1","html_id":"parameterized-empirical-risk-minimization","key":"phWVGUETAN"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"The most common fitting method for parameterized models is ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"O7FaWFJtKU"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"gradient descent","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"XMWGA4t7iD"}],"key":"tzTVZFUgw0"},{"type":"text","value":".","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"hsrs6NqW09"}],"key":"xmI21GJknp"},{"type":"proof","kind":"definition","label":"gd_def","identifier":"gd_def","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Gradient 
descent","position":{"start":{"line":44,"column":1},"end":{"line":44,"column":1}},"key":"riKJiP5RPw"}],"key":"EkRnPx0kGU"},{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"Letting ","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"FReVKtCLyu"},{"type":"inlineMath","value":"L(\\theta) \\in \\mathbb{R}","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"html":"L(θ)RL(\\theta) \\in \\mathbb{R}L(θ)R","key":"s3C2ay3RAV"},{"type":"text","value":" denote the empirical risk in terms of the parameters,\nthe gradient descent algorithm updates the parameters according to the rule","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"Us29pDtKSe"}],"key":"MCsl68xevL"},{"type":"math","value":"\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)","position":{"start":{"line":50,"column":1},"end":{"line":52,"column":1}},"html":"θt+1=θtηθL(θt)\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)θt+1=θtηθL(θt)","enumerator":"4.2","key":"sJYNaConMD"},{"type":"paragraph","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"rCtJfWpNa9"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"html":"η>0\\eta > 0η>0","key":"m6lTjQtz7B"},{"type":"text","value":" is the ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"wE8UpRvSPy"},{"type":"strong","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"learning rate","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"JXabMw1Ckm"}],"key":"qeLKNkig0o"},{"type":"text","value":".","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"YJe4RmT6go"}],"key":"WEvjpkwNLo"}],"enumerator":"4.2","html_id":"gd-def","key":"fwvdWbuU9c"}],"key":"lM0Iyqfbf5"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nfrom collections.abc import Callable","visibility":"hide","key":"KwA9xhKpa6"},{"type":"output","id":"RumHamPSt5FClTgPtu9Uz","data":[],"visibility":"show","key":"SQiG2pPqBp"}],"data":{"tags":[]},"visibility":"show","key":"beTWvPTOMK"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"Params = Float[Array, \" D\"]\n\n\ndef gradient_descent(\n loss: Callable[[Params], float],\n θ_init: Params,\n η: float,\n epochs: int,\n):\n \"\"\"\n Run gradient descent to minimize the given loss function\n (expressed in terms of the parameters).\n \"\"\"\n θ = θ_init\n for _ in range(epochs):\n θ = θ - η * grad(loss)(θ)\n return θ","key":"vKfeN2ZMMu"},{"type":"output","id":"9_6RdxyNU1eOe0LSZzPG3","data":[],"key":"jx7TGvLLWM"}],"data":{},"key":"pmuq0nbiEW"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"Linear regression","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"S6TURh7hIZ"}],"identifier":"linear-regression","label":"Linear 
regression","html_id":"linear-regression","implicit":true,"enumerator":"4.2","key":"Kul8RdWgd3"},{"type":"paragraph","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"In linear regression, we assume that the function ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"lMaNW5vyYr"},{"type":"inlineMath","value":"f","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"fff","key":"v1KLFFF1cK"},{"type":"text","value":" is linear in the parameters:","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"fpRgYoCddn"}],"key":"JuZoVvR7BI"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}","position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"html":"F={xθxθRD}\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}F={xθxθRD}","enumerator":"4.3","key":"jtdJrSUc12"},{"type":"paragraph","position":{"start":{"line":92,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"This function class is extremely simple and only contains linear functions.\nTo expand its expressivity, we can ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"nzU8myu0o5"},{"type":"emphasis","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"transform","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"odCfsFjOti"}],"key":"YTneqasMpH"},{"type":"text","value":" the input ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"zQV2l1NSVY"},{"type":"inlineMath","value":"x","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"xxx","key":"WI0DsY5bJX"},{"type":"text","value":" using some feature function ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"O1mrwQIKIv"},{"type":"text","value":"ϕ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"EGXAGiYEvK"},{"type":"text","value":",\ni.e. 
","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"YhspUt0Sue"},{"type":"inlineMath","value":"\\widetilde x = \\phi(x)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"x~=ϕ(x)\\widetilde x = \\phi(x)x=ϕ(x)","key":"dV5VZTP2v2"},{"type":"text","value":", and then fit a linear model in the transformed space instead.","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"JpfA3CWkIX"}],"key":"w4fsTW4ChB"}],"key":"U6eop8gcOz"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fit_linear(X: Float[Array, \"N D\"], y: Float[Array, \" N\"], φ=lambda x: x):\n \"\"\"Fit a linear model to the given dataset using ordinary least squares.\"\"\"\n X = vmap(φ)(X)\n θ = np.linalg.lstsq(X, y, rcond=None)[0]\n return lambda x: np.dot(φ(x), θ)","key":"V4dRDaKVuT"},{"type":"output","id":"XEcNADF2ZWtlOIzeYC_7a","data":[],"key":"qbe2oLK1kT"}],"data":{},"key":"gwQbdQg2NI"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"Neural networks","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"cT9gLDdsM3"}],"identifier":"neural-networks","label":"Neural networks","html_id":"neural-networks","implicit":true,"enumerator":"4.3","key":"Aahc8wCBgO"},{"type":"paragraph","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"In neural networks, we assume that the function ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"GL7Lir5nQW"},{"type":"inlineMath","value":"f","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"fff","key":"kCRIFhMABX"},{"type":"text","value":" is a composition of linear functions (represented by matrices ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"q7PLjUhixd"},{"type":"inlineMath","value":"W_i","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"WiW_iWi","key":"RIi5sbey18"},{"type":"text","value":") and non-linear activation functions (denoted by ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"yGw9vJg0V7"},{"type":"text","value":"σ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"fVIYK7OXAc"},{"type":"text","value":"):","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"H8djQfefEE"}],"key":"UfS9boFloJ"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}","position":{"start":{"line":108,"column":1},"end":{"line":110,"column":1}},"html":"F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}","enumerator":"4.4","key":"O2Pvboae4U"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"gkOu3lOLgO"},{"type":"inlineMath","value":"W_i \\in \\mathbb{R}^{D_{i+1} \\times D_i}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"WiRDi+1×DiW_i \\in \\mathbb{R}^{D_{i+1} \\times 
D_i}WiRDi+1×Di","key":"aNZ2CjGTWA"},{"type":"text","value":" and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"Nc1LU0q3se"},{"type":"inlineMath","value":"b_i \\in \\mathbb{R}^{D_{i+1}}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"biRDi+1b_i \\in \\mathbb{R}^{D_{i+1}}biRDi+1","key":"KqDtu2buQd"},{"type":"text","value":" are the parameters of the ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"DvZwg2BPKL"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"vuxZ8gGnjV"},{"type":"text","value":"-th layer, and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"WF2DJUsIWR"},{"type":"text","value":"σ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"m6Y5ci8Ak1"},{"type":"text","value":" is the activation function.","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"cPHTQk3LND"}],"key":"gZwXiac0wL"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"This function class is much more expressive and contains many more parameters.\nThis makes it more susceptible to overfitting on smaller datasets,\nbut also allows it to represent more complex functions.\nIn practice, however, neural networks exhibit interesting phenomena during training,\nand are often able to generalize well even with many parameters.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"OyJ5ro5aT3"}],"key":"b9TFD15737"},{"type":"paragraph","position":{"start":{"line":120,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"Another reason for their popularity is the efficient ","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"cLRcTV0AGK"},{"type":"strong","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"backpropagation","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"N1sbNvqnPD"}],"key":"MeRZ5CJnSl"},{"type":"text","value":" algorithm for computing the gradient of the empirical risk with respect to the parameters.\nEssentially, the hierarchical structure of the neural network,\ni.e. computing the output of the network as a composition of functions,\nallows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"B4jwoz6jFg"}],"key":"L9ql8JgFE9"},{"type":"paragraph","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"cite","kind":"narrative","label":"nielsen_neural_2015","identifier":"nielsen_neural_2015","children":[{"type":"text","value":"Nielsen (2015)","key":"Am5tj69XQl"}],"enumerator":"1","key":"opV8naidym"},{"type":"text","value":" provides a comprehensive introduction to neural networks and backpropagation.","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"key":"YtuoyUJ2gJ"}],"key":"eCjfpWj3O9"}],"key":"xx997w1Cjp"}],"key":"Ba87jtbIQS"},"references":{"cite":{"order":["nielsen_neural_2015"],"data":{"nielsen_neural_2015":{"label":"nielsen_neural_2015","enumerator":"1","html":"Nielsen, M. A. (2015). Neural Networks and Deep Learning. 
Determination Press."}}}},"footer":{"navigation":{"prev":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"e56ff69c011ee78674304db47cc4e85c51d95181fd2f1ca46bac12965fb5e8ee","slug":"supervised-learning","location":"/supervised_learning.md","dependencies":[],"frontmatter":{"title":"4 Supervised learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"4.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"supervised_learning.md","url":"/build/supervised_learning-350bcacee6e0c7c9985fcefbbc20f999.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"KD1iZgJ5vg"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"4.1","key":"pebQqo8HZD"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"This section will cover the details of implementing the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"zA4wO8ch9l"},{"type":"inlineCode","value":"fit","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ODNmkwQUtI"},{"type":"text","value":" function above:\nThat is, how to use a dataset of labelled samples ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"LylV3btd5G"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"VBq3lYaj2L"},{"type":"text","value":" to find a function ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Vh1mMtiSYm"},{"type":"inlineMath","value":"f","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"fff","key":"MV1ZyJhB86"},{"type":"text","value":" that minimizes the empirical risk.\nThis requires two 
ingredients:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"G6VQM7GD4o"}],"key":"O2EAmbuJhl"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":24,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"V2v5HGPsOE"},{"type":"strong","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"function class","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"Ei7YgCnN7Y"}],"key":"sL3HSZh7h6"},{"type":"text","value":" ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"h5PO1UFqT4"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"html":"F\\mathcal{F}F","key":"LAOFkXgjni"},{"type":"text","value":" to search over","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"T0CGelH1wN"}],"key":"pEqRF7Xf5R"},{"type":"listItem","spread":true,"position":{"start":{"line":25,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"KFRuvJI8vc"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"yklpYpC13J"}],"key":"wwxufNKvnb"},{"type":"text","value":" for minimizing the empirical risk over this class","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"nL29nQWsBi"}],"key":"Ypnzx5ARiS"}],"key":"EkeckTRHez"},{"type":"paragraph","position":{"start":{"line":27,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"The two main function classes we will cover are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"wEgrdHG0qs"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"linear models","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"QE3fISrtjV"}],"key":"gn5wkDN4MA"},{"type":"text","value":" and ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"alTOrrC0Nl"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"neural networks","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"A7bIpXFfZl"}],"key":"bClu4CDnZb"},{"type":"text","value":".\nBoth of these function classes are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"Gk76XO3FBb"},{"type":"emphasis","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"parameterized","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"xTzWSGbbIn"}],"key":"YyrccUpKsH"},{"type":"text","value":" by some parameters ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"oVTnE3Ktdg"},{"type":"text","value":"θ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"wwUxz9J4ra"},{"type":"text","value":",\nand the fitting method will search over 
these parameters to minimize the empirical risk:","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"TS8Wijegzw"}],"key":"kxtmyCJXkb"},{"type":"proof","kind":"definition","label":"parameterized_empirical_risk_minimization","identifier":"parameterized_empirical_risk_minimization","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Parameterized empirical risk minimization","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"yu9nXesKL4"}],"key":"DxGd2PHTCY"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Given a dataset of samples ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"Ye9L27kVId"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"SNlNsPWdRm"},{"type":"text","value":" and a class of functions ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"p6S1Ds5fN3"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"F\\mathcal{F}F","key":"nivVJ54weK"},{"type":"text","value":" parameterized by ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"X9roMHlmOu"},{"type":"text","value":"θ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"gjEnyikAD7"},{"type":"text","value":",\nwe want to find a parameter (vector) ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"NK9yrFSw3Z"},{"type":"inlineMath","value":"\\hat \\theta","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"θ^\\hat \\thetaθ^","key":"UnhKKNWb2w"},{"type":"text","value":" that minimizes the empirical risk:","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"DoujyfzkE6"}],"key":"DMcZFdYkpM"},{"type":"math","value":"\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2","position":{"start":{"line":37,"column":1},"end":{"line":39,"column":1}},"html":"θ^=argminθ1Ni=1N(yifθ(xi))2\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2θ^=argθminN1i=1N(yifθ(xi))2","enumerator":"4.1","key":"BEFH03QLqJ"}],"enumerator":"4.1","html_id":"parameterized-empirical-risk-minimization","key":"nJ4vn8bQyQ"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"The most common fitting method for parameterized models is ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"sxbkUynbFo"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"gradient descent","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"hr38c6c8Av"}],"key":"A7IIRqQC8D"},{"type":"text","value":".","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"l5yX9IaxUM"}],"key":"BRsa570YwW"},{"type":"proof","kind":"definition","label":"gd_def","identifier":"gd_def","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Gradient 
descent","position":{"start":{"line":44,"column":1},"end":{"line":44,"column":1}},"key":"fxfQC4hU2q"}],"key":"fzAkyZH0ut"},{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"Letting ","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"d3BoljxG3I"},{"type":"inlineMath","value":"L(\\theta) \\in \\mathbb{R}","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"html":"L(θ)RL(\\theta) \\in \\mathbb{R}L(θ)R","key":"jnJVjgUPpd"},{"type":"text","value":" denote the empirical risk in terms of the parameters,\nthe gradient descent algorithm updates the parameters according to the rule","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"CHvniuFkjW"}],"key":"LR63ZkhYeK"},{"type":"math","value":"\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)","position":{"start":{"line":50,"column":1},"end":{"line":52,"column":1}},"html":"θt+1=θtηθL(θt)\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)θt+1=θtηθL(θt)","enumerator":"4.2","key":"lrYFXjQf8U"},{"type":"paragraph","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"EgPPRQxfPE"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"html":"η>0\\eta > 0η>0","key":"Bk5f4DLgZ3"},{"type":"text","value":" is the ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"p5BHjk3n4n"},{"type":"strong","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"learning rate","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"StW4wRaClA"}],"key":"CZQwwvvzkc"},{"type":"text","value":".","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"ZotNUNgq9G"}],"key":"uo1Pj8o7wE"}],"enumerator":"4.2","html_id":"gd-def","key":"GblmsYnDxo"}],"key":"vPmMg3cnOH"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nfrom collections.abc import Callable","visibility":"hide","key":"FKNfaxNGVz"},{"type":"output","id":"BqAqQcLQ4CcDWuEd00PDF","data":[],"visibility":"show","key":"OBNLsfAxwv"}],"data":{"tags":[]},"visibility":"show","key":"NJ9oUs87lZ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"Params = Float[Array, \" D\"]\n\n\ndef gradient_descent(\n loss: Callable[[Params], float],\n θ_init: Params,\n η: float,\n epochs: int,\n):\n \"\"\"\n Run gradient descent to minimize the given loss function\n (expressed in terms of the parameters).\n \"\"\"\n θ = θ_init\n for _ in range(epochs):\n θ = θ - η * grad(loss)(θ)\n return θ","key":"WsT0xrl4X7"},{"type":"output","id":"7Jstr4NGR0mKGJP88uOhw","data":[],"key":"LNTNtSGo4M"}],"data":{},"key":"vLcXA3GWU0"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"Linear regression","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"DECjYrMxE3"}],"identifier":"linear-regression","label":"Linear 
regression","html_id":"linear-regression","implicit":true,"enumerator":"4.2","key":"Wia8dezJUQ"},{"type":"paragraph","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"In linear regression, we assume that the function ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"j7JirY01n1"},{"type":"inlineMath","value":"f","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"fff","key":"GeFBC7t1FZ"},{"type":"text","value":" is linear in the parameters:","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"pbqR7gCOSr"}],"key":"v86Qadv1ia"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}","position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"html":"F={xθxθRD}\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}F={xθxθRD}","enumerator":"4.3","key":"r1nbGPNSDR"},{"type":"paragraph","position":{"start":{"line":92,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"This function class is extremely simple and only contains linear functions.\nTo expand its expressivity, we can ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"mKinbYPIti"},{"type":"emphasis","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"transform","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"uPOyVxk5oR"}],"key":"ho5mVUfhXV"},{"type":"text","value":" the input ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"DH8ch3IdEi"},{"type":"inlineMath","value":"x","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"xxx","key":"BNCAIciMht"},{"type":"text","value":" using some feature function ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"NyXYjAMutx"},{"type":"text","value":"ϕ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"DZxxxkMgL5"},{"type":"text","value":",\ni.e. 
","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"eg7BGVsW3z"},{"type":"inlineMath","value":"\\widetilde x = \\phi(x)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"x~=ϕ(x)\\widetilde x = \\phi(x)x=ϕ(x)","key":"eiFOsFYUqf"},{"type":"text","value":", and then fit a linear model in the transformed space instead.","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"Sbs9TXmxm9"}],"key":"AJNcS5WLXB"}],"key":"a0tCqMAtet"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fit_linear(X: Float[Array, \"N D\"], y: Float[Array, \" N\"], φ=lambda x: x):\n \"\"\"Fit a linear model to the given dataset using ordinary least squares.\"\"\"\n X = vmap(φ)(X)\n θ = np.linalg.lstsq(X, y, rcond=None)[0]\n return lambda x: np.dot(φ(x), θ)","key":"iwDc5j1hCF"},{"type":"output","id":"dAgcGS3_T-pNb4KJwR9-N","data":[],"key":"knQduzvK9q"}],"data":{},"key":"hasPo7pBKp"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"Neural networks","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"vJVUPSCGxd"}],"identifier":"neural-networks","label":"Neural networks","html_id":"neural-networks","implicit":true,"enumerator":"4.3","key":"muMSTjfhuI"},{"type":"paragraph","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"In neural networks, we assume that the function ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"nXJpDDRUQq"},{"type":"inlineMath","value":"f","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"fff","key":"NOMRIa5Trz"},{"type":"text","value":" is a composition of linear functions (represented by matrices ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"Y86v66uhOA"},{"type":"inlineMath","value":"W_i","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"WiW_iWi","key":"b5jrt25QDf"},{"type":"text","value":") and non-linear activation functions (denoted by ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"RyaR8Gza3S"},{"type":"text","value":"σ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"njSjMZnVWP"},{"type":"text","value":"):","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"arVWrzgeqW"}],"key":"f1HVvfQgG3"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}","position":{"start":{"line":108,"column":1},"end":{"line":110,"column":1}},"html":"F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}","enumerator":"4.4","key":"kDgvlP8JFy"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"EvGpnVVQN6"},{"type":"inlineMath","value":"W_i \\in \\mathbb{R}^{D_{i+1} \\times D_i}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"WiRDi+1×DiW_i \\in \\mathbb{R}^{D_{i+1} \\times 
D_i}WiRDi+1×Di","key":"jb02pl9uiB"},{"type":"text","value":" and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"WchxlwAKUu"},{"type":"inlineMath","value":"b_i \\in \\mathbb{R}^{D_{i+1}}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"biRDi+1b_i \\in \\mathbb{R}^{D_{i+1}}biRDi+1","key":"VA6LoQ4ndl"},{"type":"text","value":" are the parameters of the ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"LPVmx7pjJR"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"iEXE9yTOoG"},{"type":"text","value":"-th layer, and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"ajs9ucUe6Z"},{"type":"text","value":"σ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"vyOl0SWDux"},{"type":"text","value":" is the activation function.","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"UVbVcryVrw"}],"key":"LQpaNw6FqF"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"This function class is much more expressive and contains many more parameters.\nThis makes it more susceptible to overfitting on smaller datasets,\nbut also allows it to represent more complex functions.\nIn practice, however, neural networks exhibit interesting phenomena during training,\nand are often able to generalize well even with many parameters.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"GxvFR2x0dT"}],"key":"igBd6gTFXE"},{"type":"paragraph","position":{"start":{"line":120,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"Another reason for their popularity is the efficient ","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"NRMG6cYDNn"},{"type":"strong","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"backpropagation","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"q93KDRlXfB"}],"key":"aCVELtHlbA"},{"type":"text","value":" algorithm for computing the gradient of the empirical risk with respect to the parameters.\nEssentially, the hierarchical structure of the neural network,\ni.e. computing the output of the network as a composition of functions,\nallows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"XGVHP3v4cR"}],"key":"KbBiKo0Zt9"},{"type":"paragraph","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"cite","kind":"narrative","label":"nielsen_neural_2015","identifier":"nielsen_neural_2015","children":[{"type":"text","value":"Nielsen (2015)","key":"LT4pPadDPN"}],"enumerator":"1","key":"AqLgREm4LX"},{"type":"text","value":" provides a comprehensive introduction to neural networks and backpropagation.","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"key":"q7eHBesTv9"}],"key":"iPMKoNAMlQ"}],"key":"HtRhzYXIsI"}],"key":"a3iaY0WL4C"},"references":{"cite":{"order":["nielsen_neural_2015"],"data":{"nielsen_neural_2015":{"label":"nielsen_neural_2015","enumerator":"1","html":"Nielsen, M. A. (2015). Neural Networks and Deep Learning. 
Determination Press."}}}},"footer":{"navigation":{"prev":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file